/*
 * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "compiler/compiler_globals.hpp"
#include "compiler/disassembler.hpp"
#include "ci/ciInlineKlass.hpp"
#include "crc32c.h"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/bytecodeHistogram.hpp"
#include "interpreter/interpreter.hpp"
#include "interpreter/interpreterRuntime.hpp"
#include "jvm.h"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/resolvedFieldEntry.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/continuation.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/jniHandles.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/signature_cc.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/macros.hpp"
#include "vmreg_x86.inline.hpp"
#ifdef COMPILER2
#include "opto/output.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif

static const Assembler::Condition reverse[] = {
    Assembler::noOverflow   /* overflow      = 0x0 */ ,
    Assembler::overflow     /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual   /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below        /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero      /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero         /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above        /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual   /* above         = 0x7 */ ,
    Assembler::positive     /* negative      = 0x8 */ ,
    Assembler::negative     /* positive      = 0x9 */ ,
    Assembler::noParity     /* parity        = 0xa */ ,
    Assembler::parity       /* noParity      = 0xb */ ,
    Assembler::greaterEqual /* less          = 0xc */ ,
    Assembler::less         /* greaterEqual  = 0xd */ ,
    Assembler::greater      /* lessEqual     = 0xe */ ,
    Assembler::lessEqual    /* greater       = 0xf, */

};


// Implementation of MacroAssembler

// First all the versions that have distinct versions depending on 32/64 bit
// Unless the difference is trivial (1 line or so).

#ifndef _LP64

// 32bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
  assert(rscratch == noreg, "");
  return Address::make_array(adr);
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}

void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}


void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Register src1, jobject obj, Register rscratch) {
  assert(rscratch == noreg, "redundant");
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}

void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
  assert(rscratch == noreg, "not needed");
  jmp(as_Address(entry, noreg));
}

// Note: y_lo will be destroyed
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
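  // Added note (illustrative example, not from the original source): the JVM
  // lcmp result is -1, 0 or 1. Comparing x = 0x00000001_00000000 with
  // y = 0x00000000_FFFFFFFF is decided on the high words alone (1 > 0), so the
  // result is 1 without the low words ever being compared.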
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
  assert(rscratch == noreg, "not needed");

  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t)adr.target(), adr.rspec());
}

void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}

void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}

void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
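  // Added note (illustrative, not in the original source): for s == 40 the code
  // above has already moved lo into hi and cleared lo, so the shldl/shll below
  // shift by 40 mod 32 == 8 and the combined effect is exactly x << 40.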
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}


void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
  assert(rscratch == noreg, "redundant");
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
  assert(rscratch == noreg, "redundant");
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
  assert(rscratch == noreg, "redundant");
  movl(as_Address(dst, noreg), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src, noreg));
}

void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
  assert(rscratch == noreg, "redundant");
  movl(dst, src);
}

void MacroAssembler::pushoop(jobject obj, Register rscratch) {
  assert(rscratch == noreg, "redundant");
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
  assert(rscratch == noreg, "redundant");
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
  assert(rscratch == noreg, "redundant");
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax,
                             int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  DebuggingContext debugging{};
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}

void MacroAssembler::stop(const char* msg) {
  // push address of message
  ExternalAddress message((address)msg);
  pushptr(message.addr(), noreg);
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  // push address of message
  ExternalAddress message((address)msg);
  pushptr(message.addr(), noreg);

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}

void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);
}

#else // _LP64

// 64 bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc());

}

Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
  AddressLiteral base = adr.base();
  lea(rscratch, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch, index._index, index._scale, index._disp);
  return array;
}

void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp, frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);
  call(RuntimeAddress(entry_point));
  addq(rsp, 8);
  jmp(E);

  bind(L);
  call(RuntimeAddress(entry_point));

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
}

void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) {
  assert(!src2.is_lval(), "should use cmpptr");
  assert(rscratch != noreg || always_reachable(src2), "missing");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch, src2);
    Assembler::cmpq(src1, Address(rscratch, 0));
  }
}

int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}

void MacroAssembler::decrementq(Register reg, int value) {
  if (value == min_jint) { subq(reg, value); return; }
  if (value <  0) { incrementq(reg, -value); return; }
  if (value ==  0) {                        ; return; }
  if (value ==  1 && UseIncDec) { decq(reg) ; return; }
  /* else */      { subq(reg, value)       ; return; }
}

void MacroAssembler::decrementq(Address dst, int value) {
  if (value == min_jint) { subq(dst, value); return; }
  if (value <  0) { incrementq(dst, -value); return; }
  if (value ==  0) {                        ; return; }
  if (value ==  1 && UseIncDec) { decq(dst) ; return; }
  /* else */      { subq(dst, value)       ; return; }
}

void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) {
  assert(rscratch != noreg || always_reachable(dst), "missing");

  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch, dst);
    incrementq(Address(rscratch, 0));
  }
}

void MacroAssembler::incrementq(Register reg, int value) {
  if (value == min_jint) { addq(reg, value); return; }
  if (value <  0) { decrementq(reg, -value); return; }
  if (value ==  0) {                        ; return; }
  if (value ==  1 && UseIncDec) { incq(reg) ; return; }
  /* else */      { addq(reg, value)       ; return; }
}

void MacroAssembler::incrementq(Address dst, int value) {
  if (value == min_jint) { addq(dst, value); return; }
  if (value <  0) { decrementq(dst, -value); return; }
  if (value ==  0) {                        ; return; }
  if (value ==  1 && UseIncDec) { incq(dst) ; return; }
  /* else */      { addq(dst, value)       ; return; }
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
  lea(rscratch, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch;
  jmp(dispatch);
}

void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) {
  lea(rscratch, adr);
  movptr(dst, rscratch);
}

void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
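  // Added note: the one-byte LEAVE encoding emitted below is equivalent to
  // mov(rsp, rbp); pop(rbp), i.e. what the 32-bit leave() above does explicitly.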
  emit_int8((unsigned char)0xC9); // LEAVE
}

void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) {
  mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch);
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) {
  mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch);
}

void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(dst, src);
      movq(dst, Address(dst, 0));
    }
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) {
  movq(as_Address(dst, rscratch), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src, dst /*rscratch*/));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
  if (is_simm32(src)) {
    movptr(dst, checked_cast<int32_t>(src));
  } else {
    mov64(rscratch, src);
    movq(dst, rscratch);
  }
}

void MacroAssembler::pushoop(jobject obj, Register rscratch) {
  movoop(rscratch, obj);
  push(rscratch);
}

void MacroAssembler::pushklass(Metadata* obj, Register rscratch) {
  mov_metadata(rscratch, obj);
  push(rscratch);
}

void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) {
  lea(rscratch, src);
  if (src.is_lval()) {
    push(rscratch);
  } else {
    pushq(Address(rscratch, 0));
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  reset_last_Java_frame(r15_thread, clear_fp);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register rscratch) {
  set_last_Java_frame(r15_thread, last_java_sp, last_java_fp, last_java_pc, rscratch);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::stop(const char* msg) {
  if (ShowMessageBoxOnError) {
    address rip = pc();
    pusha(); // get regs on stack
    lea(c_rarg1, InternalAddress(rip));
    movq(c_rarg2, rsp); // pass pointer to regs array
  }
  lea(c_rarg0, ExternalAddress((address) msg));
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

#ifdef _WIN64
  // Windows always allocates space for its register args
  subq(rsp, frame::arg_reg_save_area_bytes);
#endif
  lea(c_rarg0, ExternalAddress((address) msg));
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  DebuggingContext debugging{};
  tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
  PRINT_REG(rsp, (intptr_t)(&regs[16]));
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near the top of the stack.
  int64_t* rsp = &regs[16];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

// The java_calling_convention describes stack locations as ideal slots on
// a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the register window) the slots must be biased by
// the following value.
static int reg2offset_in(VMReg r) {
  // Account for saved rbp and return address
  // This should really be in_preserve_stack_slots
  return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
}

static int reg2offset_out(VMReg r) {
  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
}

// A long move
void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {

  // The calling conventions assure us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      if (dst.first() != src.first()) {
        mov(dst.first()->as_Register(), src.first()->as_Register());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
             src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
      movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
  }
}

// A double move
void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {

  // The calling conventions assure us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      // In theory these overlap but the ordering is such that this is likely a nop
      if ( src.first() != dst.first()) {
        movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair");
      movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(),  "not a stack pair");
    movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
  }
}


// A float arg may have to do float reg int reg conversion
void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");

  // The calling conventions assure us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.

  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
      movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
    } else {
      // stack to reg
      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
      movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
    movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
  } else {
    // reg to reg
    // In theory these overlap but the ordering is such that this is likely a nop
    if ( src.first() != dst.first()) {
      movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
    }
  }
}

// On 64 bit we will store integer like items to the stack as
// 64 bits items (x86_32/64 abi) even though java would only store
// 32bits for a parameter. On 32bit it will simply be 32 bits
// So this routine will do 32->32 on 32bit and 32->64 on 64bit
void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
      movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
    } else {
      // stack to reg
      movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    // Do we really have to sign extend???
    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
  } else {
    // Do we really have to sign extend???
    // __ movslq(dst.first()->as_Register(), src.first()->as_Register());
    if (dst.first() != src.first()) {
      movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}

void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      movq(rax, Address(rbp, reg2offset_in(src.first())));
      movq(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  } else {
    if (dst.first() != src.first()) {
      movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}

// An oop arg. Must pass a handle not the oop itself
void MacroAssembler::object_move(OopMap* map,
                                 int oop_handle_offset,
                                 int framesize_in_slots,
                                 VMRegPair src,
                                 VMRegPair dst,
                                 bool is_receiver,
                                 int* receiver_offset) {

  // must pass a handle. First figure out the location we use as a handle

  Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register();

  // See if oop is null if it is we need no handle

  if (src.first()->is_stack()) {

    // Oop is already on the stack as an argument
    int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
    map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
    if (is_receiver) {
      *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
    }

    cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD);
    lea(rHandle, Address(rbp, reg2offset_in(src.first())));
    // conditionally move a null
    cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first())));
  } else {

    // Oop is in a register we must store it to the space we reserve
    // on the stack for oop_handles and pass a handle if oop is non-null

    const Register rOop = src.first()->as_Register();
    int oop_slot;
    if (rOop == j_rarg0)
      oop_slot = 0;
    else if (rOop == j_rarg1)
      oop_slot = 1;
    else if (rOop == j_rarg2)
      oop_slot = 2;
    else if (rOop == j_rarg3)
      oop_slot = 3;
    else if (rOop == j_rarg4)
      oop_slot = 4;
    else {
      assert(rOop == j_rarg5, "wrong register");
      oop_slot = 5;
    }

    oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
    int offset = oop_slot*VMRegImpl::stack_slot_size;

    map->set_oop(VMRegImpl::stack2reg(oop_slot));
    // Store oop in handle area, may be null
    movptr(Address(rsp, offset), rOop);
    if (is_receiver) {
      *receiver_offset = offset;
    }

    cmpptr(rOop, NULL_WORD);
    lea(rHandle, Address(rsp, offset));
    // conditionally move a null from the handle area where it was just stored
    cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
  }

  // If arg is on the stack then place it otherwise it is already in correct reg.
  if (dst.first()->is_stack()) {
    movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
  }
}

#endif // _LP64

// Now versions that are common to 32/64 bit

void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::addsd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    addss(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::addpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::addpd(dst, Address(rscratch, 0));
  }
}

// See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
// Stub code is generated once and never copied.
// NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
void MacroAssembler::align64() {
  align(64, (uint)(uintptr_t)pc());
}

void MacroAssembler::align32() {
  align(32, (uint)(uintptr_t)pc());
}

void MacroAssembler::align(uint modulus) {
  // 8273459: Ensure alignment is possible with current segment alignment
  assert(modulus <= (uintx)CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
  align(modulus, offset());
}

void MacroAssembler::align(uint modulus, uint target) {
  if (target % modulus != 0) {
    nop(modulus - (target % modulus));
  }
}

void MacroAssembler::push_f(XMMRegister r) {
  subptr(rsp, wordSize);
  movflt(Address(rsp, 0), r);
}

void MacroAssembler::pop_f(XMMRegister r) {
  movflt(r, Address(rsp, 0));
  addptr(rsp, wordSize);
}

void MacroAssembler::push_d(XMMRegister r) {
  subptr(rsp, 2 * wordSize);
  movdbl(Address(rsp, 0), r);
}

void MacroAssembler::pop_d(XMMRegister r) {
  movdbl(r, Address(rsp, 0));
  addptr(rsp, 2 * Interpreter::stackElementSize);
}

void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) {
  // Used in sign-masking with aligned address.
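  // Added example of such a use (illustrative, not from the original source):
  // ANDing a double with the 16-byte-aligned mask 0x7FFFFFFFFFFFFFFF clears the
  // sign bit, which is how an abs(double) is typically built on this helper.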
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (UseAVX > 2 &&
      (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) &&
      (dst->encoding() >= 16)) {
    vpand(dst, dst, src, AVX_512bit, rscratch);
  } else if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::andpd(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    Assembler::andps(dst, Address(rscratch, 0));
  }
}

void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}

#ifdef _LP64
void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src), "missing");

  if (reachable(src)) {
    andq(dst, as_Address(src));
  } else {
    lea(rscratch, src);
    andq(dst, Address(rscratch, 0));
  }
}
#endif

void MacroAssembler::atomic_incl(Address counter_addr) {
  lock();
  incrementl(counter_addr);
}

void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");

  if (reachable(counter_addr)) {
    atomic_incl(as_Address(counter_addr));
  } else {
    lea(rscratch, counter_addr);
    atomic_incl(Address(rscratch, 0));
  }
}

#ifdef _LP64
void MacroAssembler::atomic_incq(Address counter_addr) {
  lock();
  incrementq(counter_addr);
}

void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) {
  assert(rscratch != noreg || always_reachable(counter_addr), "missing");

  if (reachable(counter_addr)) {
    atomic_incq(as_Address(counter_addr));
  } else {
    lea(rscratch, counter_addr);
    atomic_incq(Address(rscratch, 0));
  }
}
#endif

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-(int)os::vm_page_size())), size );
  subptr(tmp, (int)os::vm_page_size());
  subl(size, (int)os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down including all pages in the shadow zone.
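  // Added illustration (assumed example values, not from the source): with a
  // 4 KiB page and a 24 KiB shadow zone the loop below touches i = 1..5, i.e.
  // tmp-4K down to tmp-20K, in addition to the page already banged at tmp above.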
  for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*(int)os::vm_page_size())), size );
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;
  Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
  NOT_LP64(get_thread(rsi);)

  cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
  jcc(Assembler::below, no_reserved_zone_enabling);

  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
  jump(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}

// Wouldn't need if AddressLiteral version had new name
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}

void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}

void MacroAssembler::call(AddressLiteral entry, Register rscratch) {
  assert(rscratch != noreg || always_reachable(entry), "missing");

  if (reachable(entry)) {
    Assembler::call_literal(entry.target(), entry.rspec());
  } else {
    lea(rscratch, entry);
    Assembler::call(rscratch);
  }
}

void MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
#ifdef _LP64
  // Needs full 64-bit immediate for later patching.
  mov64(rax, (int64_t)Universe::non_oop_word());
#else
  movptr(rax, (intptr_t)Universe::non_oop_word());
#endif
  call(AddressLiteral(entry, rh));
}

int MacroAssembler::ic_check_size() {
  return
      LP64_ONLY(UseCompactObjectHeaders ? 17 : 14) NOT_LP64(12);
}

int MacroAssembler::ic_check(int end_alignment) {
  Register receiver = LP64_ONLY(j_rarg0) NOT_LP64(rcx);
  Register data = rax;
  Register temp = LP64_ONLY(rscratch1) NOT_LP64(rbx);

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after.
  align(end_alignment, offset() + ic_check_size());

  int uep_offset = offset();

#ifdef _LP64
  if (UseCompactObjectHeaders) {
    load_narrow_klass_compact(temp, receiver);
    cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
  } else
#endif
  if (UseCompressedClassPointers) {
    movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
    cmpl(temp, Address(data, CompiledICData::speculated_klass_offset()));
  } else {
    movptr(temp, Address(receiver, oopDesc::klass_offset_in_bytes()));
    cmpptr(temp, Address(data, CompiledICData::speculated_klass_offset()));
  }

  // if inline cache check fails, then jump to runtime routine
  jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  assert((offset() % end_alignment) == 0, "Misaligned verified entry point (%d, %d, %d)", uep_offset, offset(), end_alignment);

  return uep_offset;
}

void MacroAssembler::emit_static_call_stub() {
  // Static stub relocation also tags the Method* in the code-stream.
  mov_metadata(rbx, (Metadata*) nullptr);  // Method is zapped till fixup time.
  // This is recognized as unresolved by relocs/nativeinst/ic code.
  jump(RuntimeAddress(pc()));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert_different_registers(arg_1, c_rarg2));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
  LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
  LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert_different_registers(arg_1, c_rarg2));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3));
  LP64_ONLY(assert_different_registers(arg_2, c_rarg3));
  pass_arg3(this, arg_3);
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, nullptr, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM only can use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.
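  // Added example (illustrative note, not in the original comment): on 64-bit the
  // intermediate call in call_VM pushed exactly one return address, so the value
  // computed below is simply rsp + wordSize.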
1692 1693 #ifdef _LP64 1694 // We've pushed one address, correct last_Java_sp 1695 lea(rax, Address(rsp, wordSize)); 1696 #else 1697 lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize)); 1698 #endif // LP64 1699 1700 call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions); 1701 1702 } 1703 1704 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter. 1705 void MacroAssembler::call_VM_leaf0(address entry_point) { 1706 MacroAssembler::call_VM_leaf_base(entry_point, 0); 1707 } 1708 1709 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1710 call_VM_leaf_base(entry_point, number_of_arguments); 1711 } 1712 1713 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1714 pass_arg0(this, arg_0); 1715 call_VM_leaf(entry_point, 1); 1716 } 1717 1718 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1719 1720 LP64_ONLY(assert_different_registers(arg_0, c_rarg1)); 1721 pass_arg1(this, arg_1); 1722 pass_arg0(this, arg_0); 1723 call_VM_leaf(entry_point, 2); 1724 } 1725 1726 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1727 LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2)); 1728 LP64_ONLY(assert_different_registers(arg_1, c_rarg2)); 1729 pass_arg2(this, arg_2); 1730 pass_arg1(this, arg_1); 1731 pass_arg0(this, arg_0); 1732 call_VM_leaf(entry_point, 3); 1733 } 1734 1735 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1736 LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3)); 1737 LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3)); 1738 LP64_ONLY(assert_different_registers(arg_2, c_rarg3)); 1739 pass_arg3(this, arg_3); 1740 pass_arg2(this, arg_2); 1741 pass_arg1(this, arg_1); 1742 pass_arg0(this, arg_0); 1743 call_VM_leaf(entry_point, 3); 1744 } 1745 1746 void MacroAssembler::super_call_VM_leaf(address entry_point) { 1747 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1748 } 1749 1750 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1751 pass_arg0(this, arg_0); 1752 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1753 } 1754 1755 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1756 LP64_ONLY(assert_different_registers(arg_0, c_rarg1)); 1757 pass_arg1(this, arg_1); 1758 pass_arg0(this, arg_0); 1759 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1760 } 1761 1762 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1763 LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2)); 1764 LP64_ONLY(assert_different_registers(arg_1, c_rarg2)); 1765 pass_arg2(this, arg_2); 1766 pass_arg1(this, arg_1); 1767 pass_arg0(this, arg_0); 1768 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1769 } 1770 1771 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1772 LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3)); 1773 LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3)); 1774 LP64_ONLY(assert_different_registers(arg_2, c_rarg3)); 1775 pass_arg3(this, arg_3); 1776 pass_arg2(this, arg_2); 1777 pass_arg1(this, arg_1); 1778 pass_arg0(this, arg_0); 1779 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1780 } 1781 1782 void 
MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 1783 movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 1784 movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD); 1785 verify_oop_msg(oop_result, "broken oop in call_VM_base"); 1786 } 1787 1788 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 1789 movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 1790 movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD); 1791 } 1792 1793 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { 1794 } 1795 1796 void MacroAssembler::check_and_handle_popframe(Register java_thread) { 1797 } 1798 1799 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) { 1800 assert(rscratch != noreg || always_reachable(src1), "missing"); 1801 1802 if (reachable(src1)) { 1803 cmpl(as_Address(src1), imm); 1804 } else { 1805 lea(rscratch, src1); 1806 cmpl(Address(rscratch, 0), imm); 1807 } 1808 } 1809 1810 void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) { 1811 assert(!src2.is_lval(), "use cmpptr"); 1812 assert(rscratch != noreg || always_reachable(src2), "missing"); 1813 1814 if (reachable(src2)) { 1815 cmpl(src1, as_Address(src2)); 1816 } else { 1817 lea(rscratch, src2); 1818 cmpl(src1, Address(rscratch, 0)); 1819 } 1820 } 1821 1822 void MacroAssembler::cmp32(Register src1, int32_t imm) { 1823 Assembler::cmpl(src1, imm); 1824 } 1825 1826 void MacroAssembler::cmp32(Register src1, Address src2) { 1827 Assembler::cmpl(src1, src2); 1828 } 1829 1830 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1831 ucomisd(opr1, opr2); 1832 1833 Label L; 1834 if (unordered_is_less) { 1835 movl(dst, -1); 1836 jcc(Assembler::parity, L); 1837 jcc(Assembler::below , L); 1838 movl(dst, 0); 1839 jcc(Assembler::equal , L); 1840 increment(dst); 1841 } else { // unordered is greater 1842 movl(dst, 1); 1843 jcc(Assembler::parity, L); 1844 jcc(Assembler::above , L); 1845 movl(dst, 0); 1846 jcc(Assembler::equal , L); 1847 decrementl(dst); 1848 } 1849 bind(L); 1850 } 1851 1852 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1853 ucomiss(opr1, opr2); 1854 1855 Label L; 1856 if (unordered_is_less) { 1857 movl(dst, -1); 1858 jcc(Assembler::parity, L); 1859 jcc(Assembler::below , L); 1860 movl(dst, 0); 1861 jcc(Assembler::equal , L); 1862 increment(dst); 1863 } else { // unordered is greater 1864 movl(dst, 1); 1865 jcc(Assembler::parity, L); 1866 jcc(Assembler::above , L); 1867 movl(dst, 0); 1868 jcc(Assembler::equal , L); 1869 decrementl(dst); 1870 } 1871 bind(L); 1872 } 1873 1874 1875 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) { 1876 assert(rscratch != noreg || always_reachable(src1), "missing"); 1877 1878 if (reachable(src1)) { 1879 cmpb(as_Address(src1), imm); 1880 } else { 1881 lea(rscratch, src1); 1882 cmpb(Address(rscratch, 0), imm); 1883 } 1884 } 1885 1886 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) { 1887 #ifdef _LP64 1888 assert(rscratch != noreg || always_reachable(src2), "missing"); 1889 1890 if (src2.is_lval()) { 1891 movptr(rscratch, src2); 1892 Assembler::cmpq(src1, rscratch); 1893 } else if (reachable(src2)) { 1894 cmpq(src1, as_Address(src2)); 1895 } else { 1896 lea(rscratch, src2); 1897 Assembler::cmpq(src1, Address(rscratch, 0)); 
1898 } 1899 #else 1900 assert(rscratch == noreg, "not needed"); 1901 if (src2.is_lval()) { 1902 cmp_literal32(src1, (int32_t)src2.target(), src2.rspec()); 1903 } else { 1904 cmpl(src1, as_Address(src2)); 1905 } 1906 #endif // _LP64 1907 } 1908 1909 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) { 1910 assert(src2.is_lval(), "not a mem-mem compare"); 1911 #ifdef _LP64 1912 // moves src2's literal address 1913 movptr(rscratch, src2); 1914 Assembler::cmpq(src1, rscratch); 1915 #else 1916 assert(rscratch == noreg, "not needed"); 1917 cmp_literal32(src1, (int32_t)src2.target(), src2.rspec()); 1918 #endif // _LP64 1919 } 1920 1921 void MacroAssembler::cmpoop(Register src1, Register src2) { 1922 cmpptr(src1, src2); 1923 } 1924 1925 void MacroAssembler::cmpoop(Register src1, Address src2) { 1926 cmpptr(src1, src2); 1927 } 1928 1929 #ifdef _LP64 1930 void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) { 1931 movoop(rscratch, src2); 1932 cmpptr(src1, rscratch); 1933 } 1934 #endif 1935 1936 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) { 1937 assert(rscratch != noreg || always_reachable(adr), "missing"); 1938 1939 if (reachable(adr)) { 1940 lock(); 1941 cmpxchgptr(reg, as_Address(adr)); 1942 } else { 1943 lea(rscratch, adr); 1944 lock(); 1945 cmpxchgptr(reg, Address(rscratch, 0)); 1946 } 1947 } 1948 1949 void MacroAssembler::cmpxchgptr(Register reg, Address adr) { 1950 LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr)); 1951 } 1952 1953 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) { 1954 assert(rscratch != noreg || always_reachable(src), "missing"); 1955 1956 if (reachable(src)) { 1957 Assembler::comisd(dst, as_Address(src)); 1958 } else { 1959 lea(rscratch, src); 1960 Assembler::comisd(dst, Address(rscratch, 0)); 1961 } 1962 } 1963 1964 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) { 1965 assert(rscratch != noreg || always_reachable(src), "missing"); 1966 1967 if (reachable(src)) { 1968 Assembler::comiss(dst, as_Address(src)); 1969 } else { 1970 lea(rscratch, src); 1971 Assembler::comiss(dst, Address(rscratch, 0)); 1972 } 1973 } 1974 1975 1976 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) { 1977 assert(rscratch != noreg || always_reachable(counter_addr), "missing"); 1978 1979 Condition negated_cond = negate_condition(cond); 1980 Label L; 1981 jcc(negated_cond, L); 1982 pushf(); // Preserve flags 1983 atomic_incl(counter_addr, rscratch); 1984 popf(); 1985 bind(L); 1986 } 1987 1988 int MacroAssembler::corrected_idivl(Register reg) { 1989 // Full implementation of Java idiv and irem; checks for 1990 // special case as described in JVM spec., p.243 & p.271. 1991 // The function returns the (pc) offset of the idivl 1992 // instruction - may be needed for implicit exceptions. 
1993 // 1994 // normal case special case 1995 // 1996 // input : rax,: dividend min_int 1997 // reg: divisor (may not be rax,/rdx) -1 1998 // 1999 // output: rax,: quotient (= rax, idiv reg) min_int 2000 // rdx: remainder (= rax, irem reg) 0 2001 assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register"); 2002 const int min_int = 0x80000000; 2003 Label normal_case, special_case; 2004 2005 // check for special case 2006 cmpl(rax, min_int); 2007 jcc(Assembler::notEqual, normal_case); 2008 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0) 2009 cmpl(reg, -1); 2010 jcc(Assembler::equal, special_case); 2011 2012 // handle normal case 2013 bind(normal_case); 2014 cdql(); 2015 int idivl_offset = offset(); 2016 idivl(reg); 2017 2018 // normal and special case exit 2019 bind(special_case); 2020 2021 return idivl_offset; 2022 } 2023 2024 2025 2026 void MacroAssembler::decrementl(Register reg, int value) { 2027 if (value == min_jint) {subl(reg, value) ; return; } 2028 if (value < 0) { incrementl(reg, -value); return; } 2029 if (value == 0) { ; return; } 2030 if (value == 1 && UseIncDec) { decl(reg) ; return; } 2031 /* else */ { subl(reg, value) ; return; } 2032 } 2033 2034 void MacroAssembler::decrementl(Address dst, int value) { 2035 if (value == min_jint) {subl(dst, value) ; return; } 2036 if (value < 0) { incrementl(dst, -value); return; } 2037 if (value == 0) { ; return; } 2038 if (value == 1 && UseIncDec) { decl(dst) ; return; } 2039 /* else */ { subl(dst, value) ; return; } 2040 } 2041 2042 void MacroAssembler::division_with_shift (Register reg, int shift_value) { 2043 assert(shift_value > 0, "illegal shift value"); 2044 Label _is_positive; 2045 testl (reg, reg); 2046 jcc (Assembler::positive, _is_positive); 2047 int offset = (1 << shift_value) - 1 ; 2048 2049 if (offset == 1) { 2050 incrementl(reg); 2051 } else { 2052 addl(reg, offset); 2053 } 2054 2055 bind (_is_positive); 2056 sarl(reg, shift_value); 2057 } 2058 2059 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2060 assert(rscratch != noreg || always_reachable(src), "missing"); 2061 2062 if (reachable(src)) { 2063 Assembler::divsd(dst, as_Address(src)); 2064 } else { 2065 lea(rscratch, src); 2066 Assembler::divsd(dst, Address(rscratch, 0)); 2067 } 2068 } 2069 2070 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) { 2071 assert(rscratch != noreg || always_reachable(src), "missing"); 2072 2073 if (reachable(src)) { 2074 Assembler::divss(dst, as_Address(src)); 2075 } else { 2076 lea(rscratch, src); 2077 Assembler::divss(dst, Address(rscratch, 0)); 2078 } 2079 } 2080 2081 void MacroAssembler::enter() { 2082 push(rbp); 2083 mov(rbp, rsp); 2084 } 2085 2086 void MacroAssembler::post_call_nop() { 2087 if (!Continuations::enabled()) { 2088 return; 2089 } 2090 InstructionMark im(this); 2091 relocate(post_call_nop_Relocation::spec()); 2092 InlineSkippedInstructionsCounter skipCounter(this); 2093 emit_int8((uint8_t)0x0f); 2094 emit_int8((uint8_t)0x1f); 2095 emit_int8((uint8_t)0x84); 2096 emit_int8((uint8_t)0x00); 2097 emit_int32(0x00); 2098 } 2099 2100 // A 5 byte nop that is safe for patching (see patch_verified_entry) 2101 void MacroAssembler::fat_nop() { 2102 if (UseAddressNop) { 2103 addr_nop_5(); 2104 } else { 2105 emit_int8((uint8_t)0x26); // es: 2106 emit_int8((uint8_t)0x2e); // cs: 2107 emit_int8((uint8_t)0x64); // fs: 2108 emit_int8((uint8_t)0x65); // gs: 2109 emit_int8((uint8_t)0x90); 2110 } 2111 } 2112 2113 #ifndef _LP64 2114 void 
MacroAssembler::fcmp(Register tmp) { 2115 fcmp(tmp, 1, true, true); 2116 } 2117 2118 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) { 2119 assert(!pop_right || pop_left, "usage error"); 2120 if (VM_Version::supports_cmov()) { 2121 assert(tmp == noreg, "unneeded temp"); 2122 if (pop_left) { 2123 fucomip(index); 2124 } else { 2125 fucomi(index); 2126 } 2127 if (pop_right) { 2128 fpop(); 2129 } 2130 } else { 2131 assert(tmp != noreg, "need temp"); 2132 if (pop_left) { 2133 if (pop_right) { 2134 fcompp(); 2135 } else { 2136 fcomp(index); 2137 } 2138 } else { 2139 fcom(index); 2140 } 2141 // convert FPU condition into eflags condition via rax, 2142 save_rax(tmp); 2143 fwait(); fnstsw_ax(); 2144 sahf(); 2145 restore_rax(tmp); 2146 } 2147 // condition codes set as follows: 2148 // 2149 // CF (corresponds to C0) if x < y 2150 // PF (corresponds to C2) if unordered 2151 // ZF (corresponds to C3) if x = y 2152 } 2153 2154 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) { 2155 fcmp2int(dst, unordered_is_less, 1, true, true); 2156 } 2157 2158 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) { 2159 fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right); 2160 Label L; 2161 if (unordered_is_less) { 2162 movl(dst, -1); 2163 jcc(Assembler::parity, L); 2164 jcc(Assembler::below , L); 2165 movl(dst, 0); 2166 jcc(Assembler::equal , L); 2167 increment(dst); 2168 } else { // unordered is greater 2169 movl(dst, 1); 2170 jcc(Assembler::parity, L); 2171 jcc(Assembler::above , L); 2172 movl(dst, 0); 2173 jcc(Assembler::equal , L); 2174 decrementl(dst); 2175 } 2176 bind(L); 2177 } 2178 2179 void MacroAssembler::fld_d(AddressLiteral src) { 2180 fld_d(as_Address(src)); 2181 } 2182 2183 void MacroAssembler::fld_s(AddressLiteral src) { 2184 fld_s(as_Address(src)); 2185 } 2186 2187 void MacroAssembler::fldcw(AddressLiteral src) { 2188 fldcw(as_Address(src)); 2189 } 2190 2191 void MacroAssembler::fpop() { 2192 ffree(); 2193 fincstp(); 2194 } 2195 2196 void MacroAssembler::fremr(Register tmp) { 2197 save_rax(tmp); 2198 { Label L; 2199 bind(L); 2200 fprem(); 2201 fwait(); fnstsw_ax(); 2202 sahf(); 2203 jcc(Assembler::parity, L); 2204 } 2205 restore_rax(tmp); 2206 // Result is in ST0. 
2207 // Note: fxch & fpop to get rid of ST1 2208 // (otherwise FPU stack could overflow eventually) 2209 fxch(1); 2210 fpop(); 2211 } 2212 2213 void MacroAssembler::empty_FPU_stack() { 2214 if (VM_Version::supports_mmx()) { 2215 emms(); 2216 } else { 2217 for (int i = 8; i-- > 0; ) ffree(i); 2218 } 2219 } 2220 #endif // !LP64 2221 2222 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2223 assert(rscratch != noreg || always_reachable(src), "missing"); 2224 if (reachable(src)) { 2225 Assembler::mulpd(dst, as_Address(src)); 2226 } else { 2227 lea(rscratch, src); 2228 Assembler::mulpd(dst, Address(rscratch, 0)); 2229 } 2230 } 2231 2232 void MacroAssembler::load_float(Address src) { 2233 #ifdef _LP64 2234 movflt(xmm0, src); 2235 #else 2236 if (UseSSE >= 1) { 2237 movflt(xmm0, src); 2238 } else { 2239 fld_s(src); 2240 } 2241 #endif // LP64 2242 } 2243 2244 void MacroAssembler::store_float(Address dst) { 2245 #ifdef _LP64 2246 movflt(dst, xmm0); 2247 #else 2248 if (UseSSE >= 1) { 2249 movflt(dst, xmm0); 2250 } else { 2251 fstp_s(dst); 2252 } 2253 #endif // LP64 2254 } 2255 2256 void MacroAssembler::load_double(Address src) { 2257 #ifdef _LP64 2258 movdbl(xmm0, src); 2259 #else 2260 if (UseSSE >= 2) { 2261 movdbl(xmm0, src); 2262 } else { 2263 fld_d(src); 2264 } 2265 #endif // LP64 2266 } 2267 2268 void MacroAssembler::store_double(Address dst) { 2269 #ifdef _LP64 2270 movdbl(dst, xmm0); 2271 #else 2272 if (UseSSE >= 2) { 2273 movdbl(dst, xmm0); 2274 } else { 2275 fstp_d(dst); 2276 } 2277 #endif // LP64 2278 } 2279 2280 // dst = c = a * b + c 2281 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { 2282 Assembler::vfmadd231sd(c, a, b); 2283 if (dst != c) { 2284 movdbl(dst, c); 2285 } 2286 } 2287 2288 // dst = c = a * b + c 2289 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { 2290 Assembler::vfmadd231ss(c, a, b); 2291 if (dst != c) { 2292 movflt(dst, c); 2293 } 2294 } 2295 2296 // dst = c = a * b + c 2297 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2298 Assembler::vfmadd231pd(c, a, b, vector_len); 2299 if (dst != c) { 2300 vmovdqu(dst, c); 2301 } 2302 } 2303 2304 // dst = c = a * b + c 2305 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2306 Assembler::vfmadd231ps(c, a, b, vector_len); 2307 if (dst != c) { 2308 vmovdqu(dst, c); 2309 } 2310 } 2311 2312 // dst = c = a * b + c 2313 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2314 Assembler::vfmadd231pd(c, a, b, vector_len); 2315 if (dst != c) { 2316 vmovdqu(dst, c); 2317 } 2318 } 2319 2320 // dst = c = a * b + c 2321 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2322 Assembler::vfmadd231ps(c, a, b, vector_len); 2323 if (dst != c) { 2324 vmovdqu(dst, c); 2325 } 2326 } 2327 2328 void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) { 2329 assert(rscratch != noreg || always_reachable(dst), "missing"); 2330 2331 if (reachable(dst)) { 2332 incrementl(as_Address(dst)); 2333 } else { 2334 lea(rscratch, dst); 2335 incrementl(Address(rscratch, 0)); 2336 } 2337 } 2338 2339 void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) { 2340 incrementl(as_Address(dst, rscratch)); 2341 } 2342 2343 void MacroAssembler::incrementl(Register reg, int value) { 2344 if (value == min_jint) 
{addl(reg, value) ; return; } 2345 if (value < 0) { decrementl(reg, -value); return; } 2346 if (value == 0) { ; return; } 2347 if (value == 1 && UseIncDec) { incl(reg) ; return; } 2348 /* else */ { addl(reg, value) ; return; } 2349 } 2350 2351 void MacroAssembler::incrementl(Address dst, int value) { 2352 if (value == min_jint) {addl(dst, value) ; return; } 2353 if (value < 0) { decrementl(dst, -value); return; } 2354 if (value == 0) { ; return; } 2355 if (value == 1 && UseIncDec) { incl(dst) ; return; } 2356 /* else */ { addl(dst, value) ; return; } 2357 } 2358 2359 void MacroAssembler::jump(AddressLiteral dst, Register rscratch) { 2360 assert(rscratch != noreg || always_reachable(dst), "missing"); 2361 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump"); 2362 if (reachable(dst)) { 2363 jmp_literal(dst.target(), dst.rspec()); 2364 } else { 2365 lea(rscratch, dst); 2366 jmp(rscratch); 2367 } 2368 } 2369 2370 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) { 2371 assert(rscratch != noreg || always_reachable(dst), "missing"); 2372 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump_cc"); 2373 if (reachable(dst)) { 2374 InstructionMark im(this); 2375 relocate(dst.reloc()); 2376 const int short_size = 2; 2377 const int long_size = 6; 2378 int offs = (intptr_t)dst.target() - ((intptr_t)pc()); 2379 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) { 2380 // 0111 tttn #8-bit disp 2381 emit_int8(0x70 | cc); 2382 emit_int8((offs - short_size) & 0xFF); 2383 } else { 2384 // 0000 1111 1000 tttn #32-bit disp 2385 emit_int8(0x0F); 2386 emit_int8((unsigned char)(0x80 | cc)); 2387 emit_int32(offs - long_size); 2388 } 2389 } else { 2390 #ifdef ASSERT 2391 warning("reversing conditional branch"); 2392 #endif /* ASSERT */ 2393 Label skip; 2394 jccb(reverse[cc], skip); 2395 lea(rscratch, dst); 2396 Assembler::jmp(rscratch); 2397 bind(skip); 2398 } 2399 } 2400 2401 void MacroAssembler::cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch) { 2402 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std()); 2403 assert(rscratch != noreg || always_reachable(mxcsr_std), "missing"); 2404 2405 stmxcsr(mxcsr_save); 2406 movl(tmp, mxcsr_save); 2407 if (EnableX86ECoreOpts) { 2408 // The mxcsr_std has status bits set for performance on ECore 2409 orl(tmp, 0x003f); 2410 } else { 2411 // Mask out status bits (only check control and mask bits) 2412 andl(tmp, 0xFFC0); 2413 } 2414 cmp32(tmp, mxcsr_std, rscratch); 2415 } 2416 2417 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) { 2418 assert(rscratch != noreg || always_reachable(src), "missing"); 2419 2420 if (reachable(src)) { 2421 Assembler::ldmxcsr(as_Address(src)); 2422 } else { 2423 lea(rscratch, src); 2424 Assembler::ldmxcsr(Address(rscratch, 0)); 2425 } 2426 } 2427 2428 int MacroAssembler::load_signed_byte(Register dst, Address src) { 2429 int off; 2430 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2431 off = offset(); 2432 movsbl(dst, src); // movsxb 2433 } else { 2434 off = load_unsigned_byte(dst, src); 2435 shll(dst, 24); 2436 sarl(dst, 24); 2437 } 2438 return off; 2439 } 2440 2441 // Note: load_signed_short used to be called load_signed_word. 2442 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler 2443 // manual, which means 16 bits, that usage is found nowhere in HotSpot code. 2444 // The term "word" in HotSpot means a 32- or 64-bit machine word. 
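// Illustrative use (a sketch with hypothetical operands): loading a Java
// 'short' via load_signed_short(rax, field_addr) emits a single movswl on
// 64-bit and on P6+ 32-bit CPUs, as shown below; the zero-extending
// load_unsigned_short is the counterpart typically used for 'char' values.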
2445 int MacroAssembler::load_signed_short(Register dst, Address src) { 2446 int off; 2447 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2448 // This is dubious to me since it seems safe to do a signed 16 => 64 bit 2449 // version but this is what 64bit has always done. This seems to imply 2450 // that users are only using 32bits worth. 2451 off = offset(); 2452 movswl(dst, src); // movsxw 2453 } else { 2454 off = load_unsigned_short(dst, src); 2455 shll(dst, 16); 2456 sarl(dst, 16); 2457 } 2458 return off; 2459 } 2460 2461 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 2462 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 2463 // and "3.9 Partial Register Penalties", p. 22). 2464 int off; 2465 if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) { 2466 off = offset(); 2467 movzbl(dst, src); // movzxb 2468 } else { 2469 xorl(dst, dst); 2470 off = offset(); 2471 movb(dst, src); 2472 } 2473 return off; 2474 } 2475 2476 // Note: load_unsigned_short used to be called load_unsigned_word. 2477 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 2478 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 2479 // and "3.9 Partial Register Penalties", p. 22). 2480 int off; 2481 if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) { 2482 off = offset(); 2483 movzwl(dst, src); // movzxw 2484 } else { 2485 xorl(dst, dst); 2486 off = offset(); 2487 movw(dst, src); 2488 } 2489 return off; 2490 } 2491 2492 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 2493 switch (size_in_bytes) { 2494 #ifndef _LP64 2495 case 8: 2496 assert(dst2 != noreg, "second dest register required"); 2497 movl(dst, src); 2498 movl(dst2, src.plus_disp(BytesPerInt)); 2499 break; 2500 #else 2501 case 8: movq(dst, src); break; 2502 #endif 2503 case 4: movl(dst, src); break; 2504 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 2505 case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 2506 default: ShouldNotReachHere(); 2507 } 2508 } 2509 2510 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 2511 switch (size_in_bytes) { 2512 #ifndef _LP64 2513 case 8: 2514 assert(src2 != noreg, "second source register required"); 2515 movl(dst, src); 2516 movl(dst.plus_disp(BytesPerInt), src2); 2517 break; 2518 #else 2519 case 8: movq(dst, src); break; 2520 #endif 2521 case 4: movl(dst, src); break; 2522 case 2: movw(dst, src); break; 2523 case 1: movb(dst, src); break; 2524 default: ShouldNotReachHere(); 2525 } 2526 } 2527 2528 void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) { 2529 assert(rscratch != noreg || always_reachable(dst), "missing"); 2530 2531 if (reachable(dst)) { 2532 movl(as_Address(dst), src); 2533 } else { 2534 lea(rscratch, dst); 2535 movl(Address(rscratch, 0), src); 2536 } 2537 } 2538 2539 void MacroAssembler::mov32(Register dst, AddressLiteral src) { 2540 if (reachable(src)) { 2541 movl(dst, as_Address(src)); 2542 } else { 2543 lea(dst, src); 2544 movl(dst, Address(dst, 0)); 2545 } 2546 } 2547 2548 // C++ bool manipulation 2549 2550 void MacroAssembler::movbool(Register dst, Address src) { 2551 if(sizeof(bool) == 1) 2552 movb(dst, src); 2553 else if(sizeof(bool) == 2) 2554 movw(dst, src); 2555 else if(sizeof(bool) == 4) 2556 movl(dst, src); 2557 else 2558 // unsupported 2559 ShouldNotReachHere(); 2560 } 2561 2562 void MacroAssembler::movbool(Address dst, bool boolconst) { 2563 if(sizeof(bool) == 1) 2564 movb(dst, (int) boolconst); 2565 else if(sizeof(bool) == 2) 2566 movw(dst, (int) boolconst); 2567 else if(sizeof(bool) == 4) 2568 movl(dst, (int) boolconst); 2569 else 2570 // unsupported 2571 ShouldNotReachHere(); 2572 } 2573 2574 void MacroAssembler::movbool(Address dst, Register src) { 2575 if(sizeof(bool) == 1) 2576 movb(dst, src); 2577 else if(sizeof(bool) == 2) 2578 movw(dst, src); 2579 else if(sizeof(bool) == 4) 2580 movl(dst, src); 2581 else 2582 // unsupported 2583 ShouldNotReachHere(); 2584 } 2585 2586 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) { 2587 assert(rscratch != noreg || always_reachable(src), "missing"); 2588 2589 if (reachable(src)) { 2590 movdl(dst, as_Address(src)); 2591 } else { 2592 lea(rscratch, src); 2593 movdl(dst, Address(rscratch, 0)); 2594 } 2595 } 2596 2597 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) { 2598 assert(rscratch != noreg || always_reachable(src), "missing"); 2599 2600 if (reachable(src)) { 2601 movq(dst, as_Address(src)); 2602 } else { 2603 lea(rscratch, src); 2604 movq(dst, Address(rscratch, 0)); 2605 } 2606 } 2607 2608 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) { 2609 assert(rscratch != noreg || always_reachable(src), "missing"); 2610 2611 if (reachable(src)) { 2612 if (UseXmmLoadAndClearUpper) { 2613 movsd (dst, as_Address(src)); 2614 } else { 2615 movlpd(dst, as_Address(src)); 2616 } 2617 } else { 2618 lea(rscratch, src); 2619 if (UseXmmLoadAndClearUpper) { 2620 movsd (dst, Address(rscratch, 0)); 2621 } else { 2622 movlpd(dst, Address(rscratch, 0)); 2623 } 2624 } 2625 } 2626 2627 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) { 2628 assert(rscratch != noreg || always_reachable(src), "missing"); 2629 2630 if (reachable(src)) { 2631 movss(dst, as_Address(src)); 2632 } else { 2633 lea(rscratch, src); 2634 movss(dst, 
Address(rscratch, 0)); 2635 } 2636 } 2637 2638 void MacroAssembler::movptr(Register dst, Register src) { 2639 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2640 } 2641 2642 void MacroAssembler::movptr(Register dst, Address src) { 2643 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2644 } 2645 2646 // src should NEVER be a real pointer. Use AddressLiteral for true pointers 2647 void MacroAssembler::movptr(Register dst, intptr_t src) { 2648 #ifdef _LP64 2649 if (is_uimm32(src)) { 2650 movl(dst, checked_cast<uint32_t>(src)); 2651 } else if (is_simm32(src)) { 2652 movq(dst, checked_cast<int32_t>(src)); 2653 } else { 2654 mov64(dst, src); 2655 } 2656 #else 2657 movl(dst, src); 2658 #endif 2659 } 2660 2661 void MacroAssembler::movptr(Address dst, Register src) { 2662 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2663 } 2664 2665 void MacroAssembler::movptr(Address dst, int32_t src) { 2666 LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); 2667 } 2668 2669 void MacroAssembler::movdqu(Address dst, XMMRegister src) { 2670 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2671 Assembler::movdqu(dst, src); 2672 } 2673 2674 void MacroAssembler::movdqu(XMMRegister dst, Address src) { 2675 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2676 Assembler::movdqu(dst, src); 2677 } 2678 2679 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) { 2680 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2681 Assembler::movdqu(dst, src); 2682 } 2683 2684 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) { 2685 assert(rscratch != noreg || always_reachable(src), "missing"); 2686 2687 if (reachable(src)) { 2688 movdqu(dst, as_Address(src)); 2689 } else { 2690 lea(rscratch, src); 2691 movdqu(dst, Address(rscratch, 0)); 2692 } 2693 } 2694 2695 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) { 2696 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2697 Assembler::vmovdqu(dst, src); 2698 } 2699 2700 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) { 2701 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2702 Assembler::vmovdqu(dst, src); 2703 } 2704 2705 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) { 2706 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2707 Assembler::vmovdqu(dst, src); 2708 } 2709 2710 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) { 2711 assert(rscratch != noreg || always_reachable(src), "missing"); 2712 2713 if (reachable(src)) { 2714 vmovdqu(dst, as_Address(src)); 2715 } 2716 else { 2717 lea(rscratch, src); 2718 vmovdqu(dst, Address(rscratch, 0)); 2719 } 2720 } 2721 2722 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2723 assert(rscratch != noreg || always_reachable(src), "missing"); 2724 2725 if (vector_len == AVX_512bit) { 2726 evmovdquq(dst, src, AVX_512bit, rscratch); 2727 } else if (vector_len == AVX_256bit) { 2728 vmovdqu(dst, src, rscratch); 2729 } else { 2730 movdqu(dst, src, rscratch); 2731 } 2732 } 2733 2734 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src, int vector_len) { 2735 if (vector_len == AVX_512bit) { 2736 evmovdquq(dst, src, 
AVX_512bit); 2737 } else if (vector_len == AVX_256bit) { 2738 vmovdqu(dst, src); 2739 } else { 2740 movdqu(dst, src); 2741 } 2742 } 2743 2744 void MacroAssembler::vmovdqu(Address dst, XMMRegister src, int vector_len) { 2745 if (vector_len == AVX_512bit) { 2746 evmovdquq(dst, src, AVX_512bit); 2747 } else if (vector_len == AVX_256bit) { 2748 vmovdqu(dst, src); 2749 } else { 2750 movdqu(dst, src); 2751 } 2752 } 2753 2754 void MacroAssembler::vmovdqu(XMMRegister dst, Address src, int vector_len) { 2755 if (vector_len == AVX_512bit) { 2756 evmovdquq(dst, src, AVX_512bit); 2757 } else if (vector_len == AVX_256bit) { 2758 vmovdqu(dst, src); 2759 } else { 2760 movdqu(dst, src); 2761 } 2762 } 2763 2764 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, Register rscratch) { 2765 assert(rscratch != noreg || always_reachable(src), "missing"); 2766 2767 if (reachable(src)) { 2768 vmovdqa(dst, as_Address(src)); 2769 } 2770 else { 2771 lea(rscratch, src); 2772 vmovdqa(dst, Address(rscratch, 0)); 2773 } 2774 } 2775 2776 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2777 assert(rscratch != noreg || always_reachable(src), "missing"); 2778 2779 if (vector_len == AVX_512bit) { 2780 evmovdqaq(dst, src, AVX_512bit, rscratch); 2781 } else if (vector_len == AVX_256bit) { 2782 vmovdqa(dst, src, rscratch); 2783 } else { 2784 movdqa(dst, src, rscratch); 2785 } 2786 } 2787 2788 void MacroAssembler::kmov(KRegister dst, Address src) { 2789 if (VM_Version::supports_avx512bw()) { 2790 kmovql(dst, src); 2791 } else { 2792 assert(VM_Version::supports_evex(), ""); 2793 kmovwl(dst, src); 2794 } 2795 } 2796 2797 void MacroAssembler::kmov(Address dst, KRegister src) { 2798 if (VM_Version::supports_avx512bw()) { 2799 kmovql(dst, src); 2800 } else { 2801 assert(VM_Version::supports_evex(), ""); 2802 kmovwl(dst, src); 2803 } 2804 } 2805 2806 void MacroAssembler::kmov(KRegister dst, KRegister src) { 2807 if (VM_Version::supports_avx512bw()) { 2808 kmovql(dst, src); 2809 } else { 2810 assert(VM_Version::supports_evex(), ""); 2811 kmovwl(dst, src); 2812 } 2813 } 2814 2815 void MacroAssembler::kmov(Register dst, KRegister src) { 2816 if (VM_Version::supports_avx512bw()) { 2817 kmovql(dst, src); 2818 } else { 2819 assert(VM_Version::supports_evex(), ""); 2820 kmovwl(dst, src); 2821 } 2822 } 2823 2824 void MacroAssembler::kmov(KRegister dst, Register src) { 2825 if (VM_Version::supports_avx512bw()) { 2826 kmovql(dst, src); 2827 } else { 2828 assert(VM_Version::supports_evex(), ""); 2829 kmovwl(dst, src); 2830 } 2831 } 2832 2833 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) { 2834 assert(rscratch != noreg || always_reachable(src), "missing"); 2835 2836 if (reachable(src)) { 2837 kmovql(dst, as_Address(src)); 2838 } else { 2839 lea(rscratch, src); 2840 kmovql(dst, Address(rscratch, 0)); 2841 } 2842 } 2843 2844 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) { 2845 assert(rscratch != noreg || always_reachable(src), "missing"); 2846 2847 if (reachable(src)) { 2848 kmovwl(dst, as_Address(src)); 2849 } else { 2850 lea(rscratch, src); 2851 kmovwl(dst, Address(rscratch, 0)); 2852 } 2853 } 2854 2855 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2856 int vector_len, Register rscratch) { 2857 assert(rscratch != noreg || always_reachable(src), "missing"); 2858 2859 if (reachable(src)) { 2860 Assembler::evmovdqub(dst, mask, as_Address(src), merge, 
vector_len); 2861 } else { 2862 lea(rscratch, src); 2863 Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len); 2864 } 2865 } 2866 2867 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2868 int vector_len, Register rscratch) { 2869 assert(rscratch != noreg || always_reachable(src), "missing"); 2870 2871 if (reachable(src)) { 2872 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len); 2873 } else { 2874 lea(rscratch, src); 2875 Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len); 2876 } 2877 } 2878 2879 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 2880 assert(rscratch != noreg || always_reachable(src), "missing"); 2881 2882 if (reachable(src)) { 2883 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len); 2884 } else { 2885 lea(rscratch, src); 2886 Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len); 2887 } 2888 } 2889 2890 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 2891 assert(rscratch != noreg || always_reachable(src), "missing"); 2892 2893 if (reachable(src)) { 2894 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len); 2895 } else { 2896 lea(rscratch, src); 2897 Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len); 2898 } 2899 } 2900 2901 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2902 assert(rscratch != noreg || always_reachable(src), "missing"); 2903 2904 if (reachable(src)) { 2905 Assembler::evmovdquq(dst, as_Address(src), vector_len); 2906 } else { 2907 lea(rscratch, src); 2908 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len); 2909 } 2910 } 2911 2912 void MacroAssembler::evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 2913 assert(rscratch != noreg || always_reachable(src), "missing"); 2914 2915 if (reachable(src)) { 2916 Assembler::evmovdqaq(dst, mask, as_Address(src), merge, vector_len); 2917 } else { 2918 lea(rscratch, src); 2919 Assembler::evmovdqaq(dst, mask, Address(rscratch, 0), merge, vector_len); 2920 } 2921 } 2922 2923 void MacroAssembler::evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2924 assert(rscratch != noreg || always_reachable(src), "missing"); 2925 2926 if (reachable(src)) { 2927 Assembler::evmovdqaq(dst, as_Address(src), vector_len); 2928 } else { 2929 lea(rscratch, src); 2930 Assembler::evmovdqaq(dst, Address(rscratch, 0), vector_len); 2931 } 2932 } 2933 2934 2935 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) { 2936 assert(rscratch != noreg || always_reachable(src), "missing"); 2937 2938 if (reachable(src)) { 2939 Assembler::movdqa(dst, as_Address(src)); 2940 } else { 2941 lea(rscratch, src); 2942 Assembler::movdqa(dst, Address(rscratch, 0)); 2943 } 2944 } 2945 2946 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2947 assert(rscratch != noreg || always_reachable(src), "missing"); 2948 2949 if (reachable(src)) { 2950 Assembler::movsd(dst, as_Address(src)); 2951 } else { 2952 lea(rscratch, src); 2953 Assembler::movsd(dst, Address(rscratch, 0)); 2954 } 2955 } 2956 2957 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) { 2958 assert(rscratch != noreg 
|| always_reachable(src), "missing"); 2959 2960 if (reachable(src)) { 2961 Assembler::movss(dst, as_Address(src)); 2962 } else { 2963 lea(rscratch, src); 2964 Assembler::movss(dst, Address(rscratch, 0)); 2965 } 2966 } 2967 2968 void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) { 2969 assert(rscratch != noreg || always_reachable(src), "missing"); 2970 2971 if (reachable(src)) { 2972 Assembler::movddup(dst, as_Address(src)); 2973 } else { 2974 lea(rscratch, src); 2975 Assembler::movddup(dst, Address(rscratch, 0)); 2976 } 2977 } 2978 2979 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2980 assert(rscratch != noreg || always_reachable(src), "missing"); 2981 2982 if (reachable(src)) { 2983 Assembler::vmovddup(dst, as_Address(src), vector_len); 2984 } else { 2985 lea(rscratch, src); 2986 Assembler::vmovddup(dst, Address(rscratch, 0), vector_len); 2987 } 2988 } 2989 2990 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2991 assert(rscratch != noreg || always_reachable(src), "missing"); 2992 2993 if (reachable(src)) { 2994 Assembler::mulsd(dst, as_Address(src)); 2995 } else { 2996 lea(rscratch, src); 2997 Assembler::mulsd(dst, Address(rscratch, 0)); 2998 } 2999 } 3000 3001 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3002 assert(rscratch != noreg || always_reachable(src), "missing"); 3003 3004 if (reachable(src)) { 3005 Assembler::mulss(dst, as_Address(src)); 3006 } else { 3007 lea(rscratch, src); 3008 Assembler::mulss(dst, Address(rscratch, 0)); 3009 } 3010 } 3011 3012 void MacroAssembler::null_check(Register reg, int offset) { 3013 if (needs_explicit_null_check(offset)) { 3014 // provoke OS null exception if reg is null by 3015 // accessing M[reg] w/o changing any (non-CC) registers 3016 // NOTE: cmpl is plenty here to provoke a segv 3017 cmpptr(rax, Address(reg, 0)); 3018 // Note: should probably use testl(rax, Address(reg, 0)); 3019 // may be shorter code (however, this version of 3020 // testl needs to be implemented first) 3021 } else { 3022 // nothing to do, (later) access of M[reg + offset] 3023 // will provoke OS null exception if reg is null 3024 } 3025 } 3026 3027 void MacroAssembler::test_markword_is_inline_type(Register markword, Label& is_inline_type) { 3028 andptr(markword, markWord::inline_type_mask_in_place); 3029 cmpptr(markword, markWord::inline_type_pattern); 3030 jcc(Assembler::equal, is_inline_type); 3031 } 3032 3033 void MacroAssembler::test_klass_is_inline_type(Register klass, Register temp_reg, Label& is_inline_type) { 3034 load_unsigned_short(temp_reg, Address(klass, Klass::access_flags_offset())); 3035 testl(temp_reg, JVM_ACC_IDENTITY); 3036 jcc(Assembler::zero, is_inline_type); 3037 } 3038 3039 void MacroAssembler::test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type) { 3040 testptr(object, object); 3041 jcc(Assembler::zero, not_inline_type); 3042 const int is_inline_type_mask = markWord::inline_type_pattern; 3043 movptr(tmp, Address(object, oopDesc::mark_offset_in_bytes())); 3044 andptr(tmp, is_inline_type_mask); 3045 cmpptr(tmp, is_inline_type_mask); 3046 jcc(Assembler::notEqual, not_inline_type); 3047 } 3048 3049 void MacroAssembler::test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free_inline_type) { 3050 movl(temp_reg, flags); 3051 testl(temp_reg, 1 << ResolvedFieldEntry::is_null_free_inline_type_shift); 3052 jcc(Assembler::notEqual, 
is_null_free_inline_type); 3053 } 3054 3055 void MacroAssembler::test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free_inline_type) { 3056 movl(temp_reg, flags); 3057 testl(temp_reg, 1 << ResolvedFieldEntry::is_null_free_inline_type_shift); 3058 jcc(Assembler::equal, not_null_free_inline_type); 3059 } 3060 3061 void MacroAssembler::test_field_is_flat(Register flags, Register temp_reg, Label& is_flat) { 3062 movl(temp_reg, flags); 3063 testl(temp_reg, 1 << ResolvedFieldEntry::is_flat_shift); 3064 jcc(Assembler::notEqual, is_flat); 3065 } 3066 3067 void MacroAssembler::test_field_has_null_marker(Register flags, Register temp_reg, Label& has_null_marker) { 3068 movl(temp_reg, flags); 3069 testl(temp_reg, 1 << ResolvedFieldEntry::has_null_marker_shift); 3070 jcc(Assembler::notEqual, has_null_marker); 3071 } 3072 3073 void MacroAssembler::test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set, Label& jmp_label) { 3074 Label test_mark_word; 3075 // load mark word 3076 movptr(temp_reg, Address(oop, oopDesc::mark_offset_in_bytes())); 3077 // check displaced 3078 testl(temp_reg, markWord::unlocked_value); 3079 jccb(Assembler::notZero, test_mark_word); 3080 // slow path use klass prototype 3081 push(rscratch1); 3082 load_prototype_header(temp_reg, oop, rscratch1); 3083 pop(rscratch1); 3084 3085 bind(test_mark_word); 3086 testl(temp_reg, test_bit); 3087 jcc((jmp_set) ? Assembler::notZero : Assembler::zero, jmp_label); 3088 } 3089 3090 void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg, 3091 Label& is_flat_array) { 3092 #ifdef _LP64 3093 test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, true, is_flat_array); 3094 #else 3095 load_klass(temp_reg, oop, noreg); 3096 movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset())); 3097 test_flat_array_layout(temp_reg, is_flat_array); 3098 #endif 3099 } 3100 3101 void MacroAssembler::test_non_flat_array_oop(Register oop, Register temp_reg, 3102 Label& is_non_flat_array) { 3103 #ifdef _LP64 3104 test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, false, is_non_flat_array); 3105 #else 3106 load_klass(temp_reg, oop, noreg); 3107 movl(temp_reg, Address(temp_reg, Klass::layout_helper_offset())); 3108 test_non_flat_array_layout(temp_reg, is_non_flat_array); 3109 #endif 3110 } 3111 3112 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label&is_null_free_array) { 3113 #ifdef _LP64 3114 test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, true, is_null_free_array); 3115 #else 3116 Unimplemented(); 3117 #endif 3118 } 3119 3120 void MacroAssembler::test_non_null_free_array_oop(Register oop, Register temp_reg, Label&is_non_null_free_array) { 3121 #ifdef _LP64 3122 test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, false, is_non_null_free_array); 3123 #else 3124 Unimplemented(); 3125 #endif 3126 } 3127 3128 void MacroAssembler::test_flat_array_layout(Register lh, Label& is_flat_array) { 3129 testl(lh, Klass::_lh_array_tag_flat_value_bit_inplace); 3130 jcc(Assembler::notZero, is_flat_array); 3131 } 3132 3133 void MacroAssembler::test_non_flat_array_layout(Register lh, Label& is_non_flat_array) { 3134 testl(lh, Klass::_lh_array_tag_flat_value_bit_inplace); 3135 jcc(Assembler::zero, is_non_flat_array); 3136 } 3137 3138 void MacroAssembler::os_breakpoint() { 3139 // instead of directly emitting a breakpoint, call os:breakpoint for better debugability 3140 
// (e.g., MSVC can't call ps() otherwise) 3141 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint))); 3142 } 3143 3144 void MacroAssembler::unimplemented(const char* what) { 3145 const char* buf = nullptr; 3146 { 3147 ResourceMark rm; 3148 stringStream ss; 3149 ss.print("unimplemented: %s", what); 3150 buf = code_string(ss.as_string()); 3151 } 3152 stop(buf); 3153 } 3154 3155 #ifdef _LP64 3156 #define XSTATE_BV 0x200 3157 #endif 3158 3159 void MacroAssembler::pop_CPU_state() { 3160 pop_FPU_state(); 3161 pop_IU_state(); 3162 } 3163 3164 void MacroAssembler::pop_FPU_state() { 3165 #ifndef _LP64 3166 frstor(Address(rsp, 0)); 3167 #else 3168 fxrstor(Address(rsp, 0)); 3169 #endif 3170 addptr(rsp, FPUStateSizeInWords * wordSize); 3171 } 3172 3173 void MacroAssembler::pop_IU_state() { 3174 popa(); 3175 LP64_ONLY(addq(rsp, 8)); 3176 popf(); 3177 } 3178 3179 // Save Integer and Float state 3180 // Warning: Stack must be 16 byte aligned (64bit) 3181 void MacroAssembler::push_CPU_state() { 3182 push_IU_state(); 3183 push_FPU_state(); 3184 } 3185 3186 void MacroAssembler::push_FPU_state() { 3187 subptr(rsp, FPUStateSizeInWords * wordSize); 3188 #ifndef _LP64 3189 fnsave(Address(rsp, 0)); 3190 fwait(); 3191 #else 3192 fxsave(Address(rsp, 0)); 3193 #endif // LP64 3194 } 3195 3196 void MacroAssembler::push_IU_state() { 3197 // Push flags first because pusha kills them 3198 pushf(); 3199 // Make sure rsp stays 16-byte aligned 3200 LP64_ONLY(subq(rsp, 8)); 3201 pusha(); 3202 } 3203 3204 void MacroAssembler::push_cont_fastpath() { 3205 if (!Continuations::enabled()) return; 3206 3207 #ifndef _LP64 3208 Register rthread = rax; 3209 Register rrealsp = rbx; 3210 push(rthread); 3211 push(rrealsp); 3212 3213 get_thread(rthread); 3214 3215 // The code below wants the original RSP. 3216 // Move it back after the pushes above. 3217 movptr(rrealsp, rsp); 3218 addptr(rrealsp, 2*wordSize); 3219 #else 3220 Register rthread = r15_thread; 3221 Register rrealsp = rsp; 3222 #endif 3223 3224 Label done; 3225 cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset())); 3226 jccb(Assembler::belowEqual, done); 3227 movptr(Address(rthread, JavaThread::cont_fastpath_offset()), rrealsp); 3228 bind(done); 3229 3230 #ifndef _LP64 3231 pop(rrealsp); 3232 pop(rthread); 3233 #endif 3234 } 3235 3236 void MacroAssembler::pop_cont_fastpath() { 3237 if (!Continuations::enabled()) return; 3238 3239 #ifndef _LP64 3240 Register rthread = rax; 3241 Register rrealsp = rbx; 3242 push(rthread); 3243 push(rrealsp); 3244 3245 get_thread(rthread); 3246 3247 // The code below wants the original RSP. 3248 // Move it back after the pushes above. 
3249 movptr(rrealsp, rsp); 3250 addptr(rrealsp, 2*wordSize); 3251 #else 3252 Register rthread = r15_thread; 3253 Register rrealsp = rsp; 3254 #endif 3255 3256 Label done; 3257 cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset())); 3258 jccb(Assembler::below, done); 3259 movptr(Address(rthread, JavaThread::cont_fastpath_offset()), 0); 3260 bind(done); 3261 3262 #ifndef _LP64 3263 pop(rrealsp); 3264 pop(rthread); 3265 #endif 3266 } 3267 3268 void MacroAssembler::inc_held_monitor_count() { 3269 #ifdef _LP64 3270 incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 3271 #endif 3272 } 3273 3274 void MacroAssembler::dec_held_monitor_count() { 3275 #ifdef _LP64 3276 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 3277 #endif 3278 } 3279 3280 #ifdef ASSERT 3281 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) { 3282 #ifdef _LP64 3283 Label no_cont; 3284 movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset())); 3285 testl(cont, cont); 3286 jcc(Assembler::zero, no_cont); 3287 stop(name); 3288 bind(no_cont); 3289 #else 3290 Unimplemented(); 3291 #endif 3292 } 3293 #endif 3294 3295 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register 3296 if (!java_thread->is_valid()) { 3297 java_thread = rdi; 3298 get_thread(java_thread); 3299 } 3300 // we must set sp to zero to clear frame 3301 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD); 3302 // must clear fp, so that compiled frames are not confused; it is 3303 // possible that we need it only for debugging 3304 if (clear_fp) { 3305 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 3306 } 3307 // Always clear the pc because it could have been set by make_walkable() 3308 movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD); 3309 vzeroupper(); 3310 } 3311 3312 void MacroAssembler::restore_rax(Register tmp) { 3313 if (tmp == noreg) pop(rax); 3314 else if (tmp != rax) mov(rax, tmp); 3315 } 3316 3317 void MacroAssembler::round_to(Register reg, int modulus) { 3318 addptr(reg, modulus - 1); 3319 andptr(reg, -modulus); 3320 } 3321 3322 void MacroAssembler::save_rax(Register tmp) { 3323 if (tmp == noreg) push(rax); 3324 else if (tmp != rax) mov(tmp, rax); 3325 } 3326 3327 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) { 3328 if (at_return) { 3329 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore, 3330 // we may safely use rsp instead to perform the stack watermark check. 3331 cmpptr(in_nmethod ? rsp : rbp, Address(thread_reg, JavaThread::polling_word_offset())); 3332 jcc(Assembler::above, slow_path); 3333 return; 3334 } 3335 testb(Address(thread_reg, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit()); 3336 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll 3337 } 3338 3339 // Calls to C land 3340 // 3341 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded 3342 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 3343 // has to be reset to 0. This is required to allow proper stack traversal. 
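//
// A typical pairing, sketched after call_VM_base above (names are placeholders):
//
//   set_last_Java_frame(thread, last_sp, rbp, nullptr, rscratch1); // record anchor
//   MacroAssembler::call_VM_leaf_base(entry_point, nargs);         // run C/VM code
//   reset_last_Java_frame(thread, true);                           // clear sp/fp/pc
//
// so that a stack walk started from another thread (e.g. at a safepoint) can
// still find the last Java frame while this thread is executing C code.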
3344 void MacroAssembler::set_last_Java_frame(Register java_thread, 3345 Register last_java_sp, 3346 Register last_java_fp, 3347 address last_java_pc, 3348 Register rscratch) { 3349 vzeroupper(); 3350 // determine java_thread register 3351 if (!java_thread->is_valid()) { 3352 java_thread = rdi; 3353 get_thread(java_thread); 3354 } 3355 // determine last_java_sp register 3356 if (!last_java_sp->is_valid()) { 3357 last_java_sp = rsp; 3358 } 3359 // last_java_fp is optional 3360 if (last_java_fp->is_valid()) { 3361 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp); 3362 } 3363 // last_java_pc is optional 3364 if (last_java_pc != nullptr) { 3365 Address java_pc(java_thread, 3366 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()); 3367 lea(java_pc, InternalAddress(last_java_pc), rscratch); 3368 } 3369 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp); 3370 } 3371 3372 #ifdef _LP64 3373 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 3374 Register last_java_fp, 3375 Label &L, 3376 Register scratch) { 3377 lea(scratch, L); 3378 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), scratch); 3379 set_last_Java_frame(r15_thread, last_java_sp, last_java_fp, nullptr, scratch); 3380 } 3381 #endif 3382 3383 void MacroAssembler::shlptr(Register dst, int imm8) { 3384 LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8)); 3385 } 3386 3387 void MacroAssembler::shrptr(Register dst, int imm8) { 3388 LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8)); 3389 } 3390 3391 void MacroAssembler::sign_extend_byte(Register reg) { 3392 if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) { 3393 movsbl(reg, reg); // movsxb 3394 } else { 3395 shll(reg, 24); 3396 sarl(reg, 24); 3397 } 3398 } 3399 3400 void MacroAssembler::sign_extend_short(Register reg) { 3401 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 3402 movswl(reg, reg); // movsxw 3403 } else { 3404 shll(reg, 16); 3405 sarl(reg, 16); 3406 } 3407 } 3408 3409 void MacroAssembler::testl(Address dst, int32_t imm32) { 3410 if (imm32 >= 0 && is8bit(imm32)) { 3411 testb(dst, imm32); 3412 } else { 3413 Assembler::testl(dst, imm32); 3414 } 3415 } 3416 3417 void MacroAssembler::testl(Register dst, int32_t imm32) { 3418 if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) { 3419 testb(dst, imm32); 3420 } else { 3421 Assembler::testl(dst, imm32); 3422 } 3423 } 3424 3425 void MacroAssembler::testl(Register dst, AddressLiteral src) { 3426 assert(always_reachable(src), "Address should be reachable"); 3427 testl(dst, as_Address(src)); 3428 } 3429 3430 #ifdef _LP64 3431 3432 void MacroAssembler::testq(Address dst, int32_t imm32) { 3433 if (imm32 >= 0) { 3434 testl(dst, imm32); 3435 } else { 3436 Assembler::testq(dst, imm32); 3437 } 3438 } 3439 3440 void MacroAssembler::testq(Register dst, int32_t imm32) { 3441 if (imm32 >= 0) { 3442 testl(dst, imm32); 3443 } else { 3444 Assembler::testq(dst, imm32); 3445 } 3446 } 3447 3448 #endif 3449 3450 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) { 3451 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3452 Assembler::pcmpeqb(dst, src); 3453 } 3454 3455 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) { 3456 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3457 Assembler::pcmpeqw(dst, src); 3458 } 3459 3460 void 
MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) { 3461 assert((dst->encoding() < 16),"XMM register should be 0-15"); 3462 Assembler::pcmpestri(dst, src, imm8); 3463 } 3464 3465 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { 3466 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3467 Assembler::pcmpestri(dst, src, imm8); 3468 } 3469 3470 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) { 3471 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3472 Assembler::pmovzxbw(dst, src); 3473 } 3474 3475 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) { 3476 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3477 Assembler::pmovzxbw(dst, src); 3478 } 3479 3480 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) { 3481 assert((src->encoding() < 16),"XMM register should be 0-15"); 3482 Assembler::pmovmskb(dst, src); 3483 } 3484 3485 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) { 3486 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3487 Assembler::ptest(dst, src); 3488 } 3489 3490 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3491 assert(rscratch != noreg || always_reachable(src), "missing"); 3492 3493 if (reachable(src)) { 3494 Assembler::sqrtss(dst, as_Address(src)); 3495 } else { 3496 lea(rscratch, src); 3497 Assembler::sqrtss(dst, Address(rscratch, 0)); 3498 } 3499 } 3500 3501 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 3502 assert(rscratch != noreg || always_reachable(src), "missing"); 3503 3504 if (reachable(src)) { 3505 Assembler::subsd(dst, as_Address(src)); 3506 } else { 3507 lea(rscratch, src); 3508 Assembler::subsd(dst, Address(rscratch, 0)); 3509 } 3510 } 3511 3512 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) { 3513 assert(rscratch != noreg || always_reachable(src), "missing"); 3514 3515 if (reachable(src)) { 3516 Assembler::roundsd(dst, as_Address(src), rmode); 3517 } else { 3518 lea(rscratch, src); 3519 Assembler::roundsd(dst, Address(rscratch, 0), rmode); 3520 } 3521 } 3522 3523 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3524 assert(rscratch != noreg || always_reachable(src), "missing"); 3525 3526 if (reachable(src)) { 3527 Assembler::subss(dst, as_Address(src)); 3528 } else { 3529 lea(rscratch, src); 3530 Assembler::subss(dst, Address(rscratch, 0)); 3531 } 3532 } 3533 3534 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) { 3535 assert(rscratch != noreg || always_reachable(src), "missing"); 3536 3537 if (reachable(src)) { 3538 Assembler::ucomisd(dst, as_Address(src)); 3539 } else { 3540 lea(rscratch, src); 3541 Assembler::ucomisd(dst, Address(rscratch, 0)); 3542 } 3543 } 3544 3545 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3546 assert(rscratch != noreg || always_reachable(src), "missing"); 3547 3548 if (reachable(src)) { 3549 Assembler::ucomiss(dst, as_Address(src)); 3550 } else { 3551 lea(rscratch, src); 3552 Assembler::ucomiss(dst, Address(rscratch, 0)); 3553 } 3554 } 3555 3556 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 3557 assert(rscratch != noreg || always_reachable(src), "missing"); 3558 3559 // 
Used in sign-bit flipping with aligned address. 3560 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 3561 3562 if (UseAVX > 2 && 3563 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) && 3564 (dst->encoding() >= 16)) { 3565 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch); 3566 } else if (reachable(src)) { 3567 Assembler::xorpd(dst, as_Address(src)); 3568 } else { 3569 lea(rscratch, src); 3570 Assembler::xorpd(dst, Address(rscratch, 0)); 3571 } 3572 } 3573 3574 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) { 3575 if (UseAVX > 2 && 3576 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) && 3577 ((dst->encoding() >= 16) || (src->encoding() >= 16))) { 3578 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 3579 } else { 3580 Assembler::xorpd(dst, src); 3581 } 3582 } 3583 3584 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) { 3585 if (UseAVX > 2 && 3586 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) && 3587 ((dst->encoding() >= 16) || (src->encoding() >= 16))) { 3588 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 3589 } else { 3590 Assembler::xorps(dst, src); 3591 } 3592 } 3593 3594 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) { 3595 assert(rscratch != noreg || always_reachable(src), "missing"); 3596 3597 // Used in sign-bit flipping with aligned address. 3598 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 3599 3600 if (UseAVX > 2 && 3601 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) && 3602 (dst->encoding() >= 16)) { 3603 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch); 3604 } else if (reachable(src)) { 3605 Assembler::xorps(dst, as_Address(src)); 3606 } else { 3607 lea(rscratch, src); 3608 Assembler::xorps(dst, Address(rscratch, 0)); 3609 } 3610 } 3611 3612 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) { 3613 assert(rscratch != noreg || always_reachable(src), "missing"); 3614 3615 // Used in sign-bit flipping with aligned address. 
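// (Alignment note: only the legacy, non-VEX encoding faults on an unaligned
//  128-bit memory operand; VEX-encoded forms accept unaligned addresses, so
//  the assert below only has to enforce 16-byte alignment when AVX is off.)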
3616 bool aligned_adr = (((intptr_t)src.target() & 15) == 0); 3617 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes"); 3618 if (reachable(src)) { 3619 Assembler::pshufb(dst, as_Address(src)); 3620 } else { 3621 lea(rscratch, src); 3622 Assembler::pshufb(dst, Address(rscratch, 0)); 3623 } 3624 } 3625 3626 // AVX 3-operands instructions 3627 3628 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3629 assert(rscratch != noreg || always_reachable(src), "missing"); 3630 3631 if (reachable(src)) { 3632 vaddsd(dst, nds, as_Address(src)); 3633 } else { 3634 lea(rscratch, src); 3635 vaddsd(dst, nds, Address(rscratch, 0)); 3636 } 3637 } 3638 3639 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3640 assert(rscratch != noreg || always_reachable(src), "missing"); 3641 3642 if (reachable(src)) { 3643 vaddss(dst, nds, as_Address(src)); 3644 } else { 3645 lea(rscratch, src); 3646 vaddss(dst, nds, Address(rscratch, 0)); 3647 } 3648 } 3649 3650 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3651 assert(UseAVX > 0, "requires some form of AVX"); 3652 assert(rscratch != noreg || always_reachable(src), "missing"); 3653 3654 if (reachable(src)) { 3655 Assembler::vpaddb(dst, nds, as_Address(src), vector_len); 3656 } else { 3657 lea(rscratch, src); 3658 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len); 3659 } 3660 } 3661 3662 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3663 assert(UseAVX > 0, "requires some form of AVX"); 3664 assert(rscratch != noreg || always_reachable(src), "missing"); 3665 3666 if (reachable(src)) { 3667 Assembler::vpaddd(dst, nds, as_Address(src), vector_len); 3668 } else { 3669 lea(rscratch, src); 3670 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len); 3671 } 3672 } 3673 3674 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) { 3675 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3676 assert(rscratch != noreg || always_reachable(negate_field), "missing"); 3677 3678 vandps(dst, nds, negate_field, vector_len, rscratch); 3679 } 3680 3681 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) { 3682 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3683 assert(rscratch != noreg || always_reachable(negate_field), "missing"); 3684 3685 vandpd(dst, nds, negate_field, vector_len, rscratch); 3686 } 3687 3688 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3689 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3690 Assembler::vpaddb(dst, nds, src, vector_len); 3691 } 3692 3693 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3694 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3695 Assembler::vpaddb(dst, nds, src, vector_len); 3696 } 3697 3698 void 
MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3699 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3700 Assembler::vpaddw(dst, nds, src, vector_len); 3701 } 3702 3703 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3704 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3705 Assembler::vpaddw(dst, nds, src, vector_len); 3706 } 3707 3708 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3709 assert(rscratch != noreg || always_reachable(src), "missing"); 3710 3711 if (reachable(src)) { 3712 Assembler::vpand(dst, nds, as_Address(src), vector_len); 3713 } else { 3714 lea(rscratch, src); 3715 Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len); 3716 } 3717 } 3718 3719 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3720 assert(rscratch != noreg || always_reachable(src), "missing"); 3721 3722 if (reachable(src)) { 3723 Assembler::vpbroadcastd(dst, as_Address(src), vector_len); 3724 } else { 3725 lea(rscratch, src); 3726 Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len); 3727 } 3728 } 3729 3730 void MacroAssembler::vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3731 assert(rscratch != noreg || always_reachable(src), "missing"); 3732 3733 if (reachable(src)) { 3734 Assembler::vbroadcasti128(dst, as_Address(src), vector_len); 3735 } else { 3736 lea(rscratch, src); 3737 Assembler::vbroadcasti128(dst, Address(rscratch, 0), vector_len); 3738 } 3739 } 3740 3741 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3742 assert(rscratch != noreg || always_reachable(src), "missing"); 3743 3744 if (reachable(src)) { 3745 Assembler::vpbroadcastq(dst, as_Address(src), vector_len); 3746 } else { 3747 lea(rscratch, src); 3748 Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len); 3749 } 3750 } 3751 3752 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3753 assert(rscratch != noreg || always_reachable(src), "missing"); 3754 3755 if (reachable(src)) { 3756 Assembler::vbroadcastsd(dst, as_Address(src), vector_len); 3757 } else { 3758 lea(rscratch, src); 3759 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len); 3760 } 3761 } 3762 3763 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3764 assert(rscratch != noreg || always_reachable(src), "missing"); 3765 3766 if (reachable(src)) { 3767 Assembler::vbroadcastss(dst, as_Address(src), vector_len); 3768 } else { 3769 lea(rscratch, src); 3770 Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len); 3771 } 3772 } 3773 3774 // Vector float blend 3775 // vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg) 3776 void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) { 3777 // WARN: Allow dst == (src1|src2), mask == scratch 3778 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1; 3779 bool scratch_available = scratch != xnoreg 
&& scratch != src1 && scratch != src2 && scratch != dst; 3780 bool dst_available = dst != mask && (dst != src1 || dst != src2); 3781 if (blend_emulation && scratch_available && dst_available) { 3782 if (compute_mask) { 3783 vpsrad(scratch, mask, 32, vector_len); 3784 mask = scratch; 3785 } 3786 if (dst == src1) { 3787 vpandn(dst, mask, src1, vector_len); // if mask == 0, src1 3788 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2 3789 } else { 3790 vpand (dst, mask, src2, vector_len); // if mask == 1, src2 3791 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1 3792 } 3793 vpor(dst, dst, scratch, vector_len); 3794 } else { 3795 Assembler::vblendvps(dst, src1, src2, mask, vector_len); 3796 } 3797 } 3798 3799 // vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg) 3800 void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) { 3801 // WARN: Allow dst == (src1|src2), mask == scratch 3802 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1; 3803 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask); 3804 bool dst_available = dst != mask && (dst != src1 || dst != src2); 3805 if (blend_emulation && scratch_available && dst_available) { 3806 if (compute_mask) { 3807 vpxor(scratch, scratch, scratch, vector_len); 3808 vpcmpgtq(scratch, scratch, mask, vector_len); 3809 mask = scratch; 3810 } 3811 if (dst == src1) { 3812 vpandn(dst, mask, src1, vector_len); // if mask == 0, src 3813 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2 3814 } else { 3815 vpand (dst, mask, src2, vector_len); // if mask == 1, src2 3816 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src 3817 } 3818 vpor(dst, dst, scratch, vector_len); 3819 } else { 3820 Assembler::vblendvpd(dst, src1, src2, mask, vector_len); 3821 } 3822 } 3823 3824 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3825 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3826 Assembler::vpcmpeqb(dst, nds, src, vector_len); 3827 } 3828 3829 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) { 3830 assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3831 Assembler::vpcmpeqb(dst, src1, src2, vector_len); 3832 } 3833 3834 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3835 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3836 Assembler::vpcmpeqw(dst, nds, src, vector_len); 3837 } 3838 3839 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3840 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3841 Assembler::vpcmpeqw(dst, nds, src, vector_len); 3842 } 3843 3844 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3845 assert(rscratch != noreg || always_reachable(src), "missing"); 3846 3847 if (reachable(src)) { 3848 
Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len); 3849 } else { 3850 lea(rscratch, src); 3851 Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len); 3852 } 3853 } 3854 3855 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3856 int comparison, bool is_signed, int vector_len, Register rscratch) { 3857 assert(rscratch != noreg || always_reachable(src), "missing"); 3858 3859 if (reachable(src)) { 3860 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3861 } else { 3862 lea(rscratch, src); 3863 Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3864 } 3865 } 3866 3867 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3868 int comparison, bool is_signed, int vector_len, Register rscratch) { 3869 assert(rscratch != noreg || always_reachable(src), "missing"); 3870 3871 if (reachable(src)) { 3872 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3873 } else { 3874 lea(rscratch, src); 3875 Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3876 } 3877 } 3878 3879 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3880 int comparison, bool is_signed, int vector_len, Register rscratch) { 3881 assert(rscratch != noreg || always_reachable(src), "missing"); 3882 3883 if (reachable(src)) { 3884 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3885 } else { 3886 lea(rscratch, src); 3887 Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3888 } 3889 } 3890 3891 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3892 int comparison, bool is_signed, int vector_len, Register rscratch) { 3893 assert(rscratch != noreg || always_reachable(src), "missing"); 3894 3895 if (reachable(src)) { 3896 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3897 } else { 3898 lea(rscratch, src); 3899 Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3900 } 3901 } 3902 3903 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) { 3904 if (width == Assembler::Q) { 3905 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len); 3906 } else { 3907 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len); 3908 } 3909 } 3910 3911 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) { 3912 int eq_cond_enc = 0x29; 3913 int gt_cond_enc = 0x37; 3914 if (width != Assembler::Q) { 3915 eq_cond_enc = 0x74 + width; 3916 gt_cond_enc = 0x64 + width; 3917 } 3918 switch (cond) { 3919 case eq: 3920 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); 3921 break; 3922 case neq: 3923 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); 3924 vallones(xtmp, vector_len); 3925 vpxor(dst, xtmp, dst, vector_len); 3926 break; 3927 case le: 3928 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); 3929 vallones(xtmp, vector_len); 3930 vpxor(dst, xtmp, dst, vector_len); 3931 break; 3932 case nlt: 3933 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); 3934 vallones(xtmp, vector_len); 3935 vpxor(dst, xtmp, dst, 
vector_len); 3936 break; 3937 case lt: 3938 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); 3939 break; 3940 case nle: 3941 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); 3942 break; 3943 default: 3944 assert(false, "Should not reach here"); 3945 } 3946 } 3947 3948 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { 3949 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3950 Assembler::vpmovzxbw(dst, src, vector_len); 3951 } 3952 3953 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) { 3954 assert((src->encoding() < 16),"XMM register should be 0-15"); 3955 Assembler::vpmovmskb(dst, src, vector_len); 3956 } 3957 3958 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3959 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3960 Assembler::vpmullw(dst, nds, src, vector_len); 3961 } 3962 3963 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3964 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3965 Assembler::vpmullw(dst, nds, src, vector_len); 3966 } 3967 3968 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3969 assert((UseAVX > 0), "AVX support is needed"); 3970 assert(rscratch != noreg || always_reachable(src), "missing"); 3971 3972 if (reachable(src)) { 3973 Assembler::vpmulld(dst, nds, as_Address(src), vector_len); 3974 } else { 3975 lea(rscratch, src); 3976 Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len); 3977 } 3978 } 3979 3980 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3981 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3982 Assembler::vpsubb(dst, nds, src, vector_len); 3983 } 3984 3985 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3986 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3987 Assembler::vpsubb(dst, nds, src, vector_len); 3988 } 3989 3990 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3991 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3992 Assembler::vpsubw(dst, nds, src, vector_len); 3993 } 3994 3995 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3996 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3997 Assembler::vpsubw(dst, nds, src, vector_len); 3998 } 3999 4000 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 4001 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 4002 Assembler::vpsraw(dst, nds, shift, vector_len); 4003 } 4004 4005 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 4006 assert(((dst->encoding() < 16 && nds->encoding() < 16) || 
VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 4007 Assembler::vpsraw(dst, nds, shift, vector_len); 4008 } 4009 4010 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 4011 assert(UseAVX > 2,""); 4012 if (!VM_Version::supports_avx512vl() && vector_len < 2) { 4013 vector_len = 2; 4014 } 4015 Assembler::evpsraq(dst, nds, shift, vector_len); 4016 } 4017 4018 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 4019 assert(UseAVX > 2,""); 4020 if (!VM_Version::supports_avx512vl() && vector_len < 2) { 4021 vector_len = 2; 4022 } 4023 Assembler::evpsraq(dst, nds, shift, vector_len); 4024 } 4025 4026 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 4027 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 4028 Assembler::vpsrlw(dst, nds, shift, vector_len); 4029 } 4030 4031 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 4032 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 4033 Assembler::vpsrlw(dst, nds, shift, vector_len); 4034 } 4035 4036 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 4037 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 4038 Assembler::vpsllw(dst, nds, shift, vector_len); 4039 } 4040 4041 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 4042 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 4043 Assembler::vpsllw(dst, nds, shift, vector_len); 4044 } 4045 4046 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) { 4047 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 4048 Assembler::vptest(dst, src); 4049 } 4050 4051 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) { 4052 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 4053 Assembler::punpcklbw(dst, src); 4054 } 4055 4056 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) { 4057 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 4058 Assembler::pshufd(dst, src, mode); 4059 } 4060 4061 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { 4062 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 4063 Assembler::pshuflw(dst, src, mode); 4064 } 4065 4066 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 4067 assert(rscratch != noreg || always_reachable(src), "missing"); 4068 4069 if (reachable(src)) { 4070 vandpd(dst, nds, as_Address(src), vector_len); 4071 } else { 4072 lea(rscratch, src); 4073 vandpd(dst, nds, Address(rscratch, 0), vector_len); 4074 } 4075 } 4076 4077 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 4078 assert(rscratch != noreg || always_reachable(src), "missing"); 4079 4080 if (reachable(src)) { 4081 vandps(dst, nds, as_Address(src), vector_len); 
4082 } else { 4083 lea(rscratch, src); 4084 vandps(dst, nds, Address(rscratch, 0), vector_len); 4085 } 4086 } 4087 4088 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, 4089 bool merge, int vector_len, Register rscratch) { 4090 assert(rscratch != noreg || always_reachable(src), "missing"); 4091 4092 if (reachable(src)) { 4093 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len); 4094 } else { 4095 lea(rscratch, src); 4096 Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len); 4097 } 4098 } 4099 4100 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 4101 assert(rscratch != noreg || always_reachable(src), "missing"); 4102 4103 if (reachable(src)) { 4104 vdivsd(dst, nds, as_Address(src)); 4105 } else { 4106 lea(rscratch, src); 4107 vdivsd(dst, nds, Address(rscratch, 0)); 4108 } 4109 } 4110 4111 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 4112 assert(rscratch != noreg || always_reachable(src), "missing"); 4113 4114 if (reachable(src)) { 4115 vdivss(dst, nds, as_Address(src)); 4116 } else { 4117 lea(rscratch, src); 4118 vdivss(dst, nds, Address(rscratch, 0)); 4119 } 4120 } 4121 4122 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 4123 assert(rscratch != noreg || always_reachable(src), "missing"); 4124 4125 if (reachable(src)) { 4126 vmulsd(dst, nds, as_Address(src)); 4127 } else { 4128 lea(rscratch, src); 4129 vmulsd(dst, nds, Address(rscratch, 0)); 4130 } 4131 } 4132 4133 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 4134 assert(rscratch != noreg || always_reachable(src), "missing"); 4135 4136 if (reachable(src)) { 4137 vmulss(dst, nds, as_Address(src)); 4138 } else { 4139 lea(rscratch, src); 4140 vmulss(dst, nds, Address(rscratch, 0)); 4141 } 4142 } 4143 4144 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 4145 assert(rscratch != noreg || always_reachable(src), "missing"); 4146 4147 if (reachable(src)) { 4148 vsubsd(dst, nds, as_Address(src)); 4149 } else { 4150 lea(rscratch, src); 4151 vsubsd(dst, nds, Address(rscratch, 0)); 4152 } 4153 } 4154 4155 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 4156 assert(rscratch != noreg || always_reachable(src), "missing"); 4157 4158 if (reachable(src)) { 4159 vsubss(dst, nds, as_Address(src)); 4160 } else { 4161 lea(rscratch, src); 4162 vsubss(dst, nds, Address(rscratch, 0)); 4163 } 4164 } 4165 4166 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 4167 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 4168 assert(rscratch != noreg || always_reachable(src), "missing"); 4169 4170 vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch); 4171 } 4172 4173 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 4174 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 4175 assert(rscratch != noreg || always_reachable(src), "missing"); 4176 4177 vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch); 4178 } 4179 4180 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, 
int vector_len, Register rscratch) { 4181 assert(rscratch != noreg || always_reachable(src), "missing"); 4182 4183 if (reachable(src)) { 4184 vxorpd(dst, nds, as_Address(src), vector_len); 4185 } else { 4186 lea(rscratch, src); 4187 vxorpd(dst, nds, Address(rscratch, 0), vector_len); 4188 } 4189 } 4190 4191 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 4192 assert(rscratch != noreg || always_reachable(src), "missing"); 4193 4194 if (reachable(src)) { 4195 vxorps(dst, nds, as_Address(src), vector_len); 4196 } else { 4197 lea(rscratch, src); 4198 vxorps(dst, nds, Address(rscratch, 0), vector_len); 4199 } 4200 } 4201 4202 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 4203 assert(rscratch != noreg || always_reachable(src), "missing"); 4204 4205 if (UseAVX > 1 || (vector_len < 1)) { 4206 if (reachable(src)) { 4207 Assembler::vpxor(dst, nds, as_Address(src), vector_len); 4208 } else { 4209 lea(rscratch, src); 4210 Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len); 4211 } 4212 } else { 4213 MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch); 4214 } 4215 } 4216 4217 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 4218 assert(rscratch != noreg || always_reachable(src), "missing"); 4219 4220 if (reachable(src)) { 4221 Assembler::vpermd(dst, nds, as_Address(src), vector_len); 4222 } else { 4223 lea(rscratch, src); 4224 Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len); 4225 } 4226 } 4227 4228 void MacroAssembler::clear_jobject_tag(Register possibly_non_local) { 4229 const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask); 4230 STATIC_ASSERT(inverted_mask == -4); // otherwise check this code 4231 // The inverted mask is sign-extended 4232 andptr(possibly_non_local, inverted_mask); 4233 } 4234 4235 void MacroAssembler::resolve_jobject(Register value, 4236 Register thread, 4237 Register tmp) { 4238 assert_different_registers(value, thread, tmp); 4239 Label done, tagged, weak_tagged; 4240 testptr(value, value); 4241 jcc(Assembler::zero, done); // Use null as-is. 4242 testptr(value, JNIHandles::tag_mask); // Test for tag. 4243 jcc(Assembler::notZero, tagged); 4244 4245 // Resolve local handle 4246 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp, thread); 4247 verify_oop(value); 4248 jmp(done); 4249 4250 bind(tagged); 4251 testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag. 4252 jcc(Assembler::notZero, weak_tagged); 4253 4254 // Resolve global handle 4255 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread); 4256 verify_oop(value); 4257 jmp(done); 4258 4259 bind(weak_tagged); 4260 // Resolve jweak. 4261 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, 4262 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp, thread); 4263 verify_oop(value); 4264 4265 bind(done); 4266 } 4267 4268 void MacroAssembler::resolve_global_jobject(Register value, 4269 Register thread, 4270 Register tmp) { 4271 assert_different_registers(value, thread, tmp); 4272 Label done; 4273 4274 testptr(value, value); 4275 jcc(Assembler::zero, done); // Use null as-is. 4276 4277 #ifdef ASSERT 4278 { 4279 Label valid_global_tag; 4280 testptr(value, JNIHandles::TypeTag::global); // Test for global tag. 
4281 jcc(Assembler::notZero, valid_global_tag); 4282 stop("non global jobject using resolve_global_jobject"); 4283 bind(valid_global_tag); 4284 } 4285 #endif 4286 4287 // Resolve global handle 4288 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread); 4289 verify_oop(value); 4290 4291 bind(done); 4292 } 4293 4294 void MacroAssembler::subptr(Register dst, int32_t imm32) { 4295 LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32)); 4296 } 4297 4298 // Force generation of a 4 byte immediate value even if it fits into 8bit 4299 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) { 4300 LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32)); 4301 } 4302 4303 void MacroAssembler::subptr(Register dst, Register src) { 4304 LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); 4305 } 4306 4307 // C++ bool manipulation 4308 void MacroAssembler::testbool(Register dst) { 4309 if(sizeof(bool) == 1) 4310 testb(dst, 0xff); 4311 else if(sizeof(bool) == 2) { 4312 // testw implementation needed for two byte bools 4313 ShouldNotReachHere(); 4314 } else if(sizeof(bool) == 4) 4315 testl(dst, dst); 4316 else 4317 // unsupported 4318 ShouldNotReachHere(); 4319 } 4320 4321 void MacroAssembler::testptr(Register dst, Register src) { 4322 LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src)); 4323 } 4324 4325 // Object / value buffer allocation... 4326 // 4327 // Kills klass and rsi on LP64 4328 void MacroAssembler::allocate_instance(Register klass, Register new_obj, 4329 Register t1, Register t2, 4330 bool clear_fields, Label& alloc_failed) 4331 { 4332 Label done, initialize_header, initialize_object, slow_case, slow_case_no_pop; 4333 Register layout_size = t1; 4334 assert(new_obj == rax, "needs to be rax"); 4335 assert_different_registers(klass, new_obj, t1, t2); 4336 4337 // get instance_size in InstanceKlass (scaled to a count of bytes) 4338 movl(layout_size, Address(klass, Klass::layout_helper_offset())); 4339 // test to see if it is malformed in some way 4340 testl(layout_size, Klass::_lh_instance_slow_path_bit); 4341 jcc(Assembler::notZero, slow_case_no_pop); 4342 4343 // Allocate the instance: 4344 // If TLAB is enabled: 4345 // Try to allocate in the TLAB. 4346 // If fails, go to the slow path. 4347 // Else If inline contiguous allocations are enabled: 4348 // Try to allocate in eden. 4349 // If fails due to heap end, go to slow path. 4350 // 4351 // If TLAB is enabled OR inline contiguous is enabled: 4352 // Initialize the allocation. 4353 // Exit. 4354 // 4355 // Go to slow path. 4356 4357 push(klass); 4358 const Register thread = r15_thread; 4359 4360 if (UseTLAB) { 4361 tlab_allocate(thread, new_obj, layout_size, 0, klass, t2, slow_case); 4362 if (ZeroTLAB || (!clear_fields)) { 4363 // the fields have been already cleared 4364 jmp(initialize_header); 4365 } else { 4366 // initialize both the header and fields 4367 jmp(initialize_object); 4368 } 4369 } else { 4370 jmp(slow_case); 4371 } 4372 4373 // If UseTLAB is true, the object is created above and there is an initialize need. 4374 // Otherwise, skip and go to the slow path. 4375 if (UseTLAB) { 4376 if (clear_fields) { 4377 // The object is initialized before the header. If the object size is 4378 // zero, go directly to the header initialization. 
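// Sketch of the clearing code that follows: layout_size is reduced by the
// header size and converted from bytes to 8-byte words, then the loop stores
// zero from the last field word down to the first. For example, a 32-byte
// instance with the classic 16-byte header is cleared with two 8-byte stores.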
4379 bind(initialize_object); 4380 if (UseCompactObjectHeaders) { 4381 assert(is_aligned(oopDesc::base_offset_in_bytes(), BytesPerLong), "oop base offset must be 8-byte-aligned"); 4382 decrement(layout_size, oopDesc::base_offset_in_bytes()); 4383 } else { 4384 decrement(layout_size, sizeof(oopDesc)); 4385 } 4386 jcc(Assembler::zero, initialize_header); 4387 4388 // Initialize topmost object field, divide size by 8, check if odd and 4389 // test if zero. 4390 Register zero = klass; 4391 xorl(zero, zero); // use zero reg to clear memory (shorter code) 4392 shrl(layout_size, LogBytesPerLong); // divide by 2*oopSize and set carry flag if odd 4393 4394 #ifdef ASSERT 4395 // make sure instance_size was multiple of 8 4396 Label L; 4397 // Ignore partial flag stall after shrl() since it is debug VM 4398 jcc(Assembler::carryClear, L); 4399 stop("object size is not multiple of 2 - adjust this code"); 4400 bind(L); 4401 // must be > 0, no extra check needed here 4402 #endif 4403 4404 // initialize remaining object fields: instance_size was a multiple of 8 4405 { 4406 Label loop; 4407 bind(loop); 4408 int header_size_bytes = oopDesc::header_size() * HeapWordSize; 4409 assert(is_aligned(header_size_bytes, BytesPerLong), "oop header size must be 8-byte-aligned"); 4410 movptr(Address(new_obj, layout_size, Address::times_8, header_size_bytes - 1*oopSize), zero); 4411 decrement(layout_size); 4412 jcc(Assembler::notZero, loop); 4413 } 4414 } // clear_fields 4415 4416 // initialize object header only. 4417 bind(initialize_header); 4418 if (UseCompactObjectHeaders || EnableValhalla) { 4419 pop(klass); 4420 Register mark_word = t2; 4421 movptr(mark_word, Address(klass, Klass::prototype_header_offset())); 4422 movptr(Address(new_obj, oopDesc::mark_offset_in_bytes ()), mark_word); 4423 } else { 4424 movptr(Address(new_obj, oopDesc::mark_offset_in_bytes()), 4425 (intptr_t)markWord::prototype().value()); // header 4426 pop(klass); // get saved klass back in the register. 4427 } 4428 if (!UseCompactObjectHeaders) { 4429 xorl(rsi, rsi); // use zero reg to clear memory (shorter code) 4430 store_klass_gap(new_obj, rsi); // zero klass gap for compressed oops 4431 movptr(t2, klass); // preserve klass 4432 store_klass(new_obj, t2, rscratch1); // src klass reg is potentially compressed 4433 } 4434 jmp(done); 4435 } 4436 4437 bind(slow_case); 4438 pop(klass); 4439 bind(slow_case_no_pop); 4440 jmp(alloc_failed); 4441 4442 bind(done); 4443 } 4444 4445 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
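// The bump-pointer allocation itself is delegated to the current GC's
// BarrierSetAssembler; if the TLAB cannot satisfy the request, control
// transfers to slow_case.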
4446 void MacroAssembler::tlab_allocate(Register thread, Register obj, 4447 Register var_size_in_bytes, 4448 int con_size_in_bytes, 4449 Register t1, 4450 Register t2, 4451 Label& slow_case) { 4452 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4453 bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 4454 } 4455 4456 RegSet MacroAssembler::call_clobbered_gp_registers() { 4457 RegSet regs; 4458 #ifdef _LP64 4459 regs += RegSet::of(rax, rcx, rdx); 4460 #ifndef _WINDOWS 4461 regs += RegSet::of(rsi, rdi); 4462 #endif 4463 regs += RegSet::range(r8, r11); 4464 #else 4465 regs += RegSet::of(rax, rcx, rdx); 4466 #endif 4467 #ifdef _LP64 4468 if (UseAPX) { 4469 regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1)); 4470 } 4471 #endif 4472 return regs; 4473 } 4474 4475 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() { 4476 int num_xmm_registers = XMMRegister::available_xmm_registers(); 4477 #if defined(_WINDOWS) 4478 XMMRegSet result = XMMRegSet::range(xmm0, xmm5); 4479 if (num_xmm_registers > 16) { 4480 result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1)); 4481 } 4482 return result; 4483 #else 4484 return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1)); 4485 #endif 4486 } 4487 4488 static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor 4489 4490 #ifndef _LP64 4491 static bool use_x87_registers() { return UseSSE < 2; } 4492 #endif 4493 static bool use_xmm_registers() { return UseSSE >= 1; } 4494 4495 // C1 only ever uses the first double/float of the XMM register. 4496 static int xmm_save_size() { return UseSSE >= 2 ? sizeof(double) : sizeof(float); } 4497 4498 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) { 4499 if (UseSSE == 1) { 4500 masm->movflt(Address(rsp, offset), reg); 4501 } else { 4502 masm->movdbl(Address(rsp, offset), reg); 4503 } 4504 } 4505 4506 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) { 4507 if (UseSSE == 1) { 4508 masm->movflt(reg, Address(rsp, offset)); 4509 } else { 4510 masm->movdbl(reg, Address(rsp, offset)); 4511 } 4512 } 4513 4514 static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers, 4515 bool save_fpu, int& gp_area_size, 4516 int& fp_area_size, int& xmm_area_size) { 4517 4518 gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size, 4519 StackAlignmentInBytes); 4520 #ifdef _LP64 4521 fp_area_size = 0; 4522 #else 4523 fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0; 4524 #endif 4525 xmm_area_size = (save_fpu && use_xmm_registers()) ? 
xmm_registers.size() * xmm_save_size() : 0; 4526 4527 return gp_area_size + fp_area_size + xmm_area_size; 4528 } 4529 4530 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) { 4531 block_comment("push_call_clobbered_registers start"); 4532 // Regular registers 4533 RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude; 4534 4535 int gp_area_size; 4536 int fp_area_size; 4537 int xmm_area_size; 4538 int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu, 4539 gp_area_size, fp_area_size, xmm_area_size); 4540 subptr(rsp, total_save_size); 4541 4542 push_set(gp_registers_to_push, 0); 4543 4544 #ifndef _LP64 4545 if (save_fpu && use_x87_registers()) { 4546 fnsave(Address(rsp, gp_area_size)); 4547 fwait(); 4548 } 4549 #endif 4550 if (save_fpu && use_xmm_registers()) { 4551 push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size); 4552 } 4553 4554 block_comment("push_call_clobbered_registers end"); 4555 } 4556 4557 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) { 4558 block_comment("pop_call_clobbered_registers start"); 4559 4560 RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude; 4561 4562 int gp_area_size; 4563 int fp_area_size; 4564 int xmm_area_size; 4565 int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu, 4566 gp_area_size, fp_area_size, xmm_area_size); 4567 4568 if (restore_fpu && use_xmm_registers()) { 4569 pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size); 4570 } 4571 #ifndef _LP64 4572 if (restore_fpu && use_x87_registers()) { 4573 frstor(Address(rsp, gp_area_size)); 4574 } 4575 #endif 4576 4577 pop_set(gp_registers_to_pop, 0); 4578 4579 addptr(rsp, total_save_size); 4580 4581 vzeroupper(); 4582 4583 block_comment("pop_call_clobbered_registers end"); 4584 } 4585 4586 void MacroAssembler::push_set(XMMRegSet set, int offset) { 4587 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be"); 4588 int spill_offset = offset; 4589 4590 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) { 4591 save_xmm_register(this, spill_offset, *it); 4592 spill_offset += xmm_save_size(); 4593 } 4594 } 4595 4596 void MacroAssembler::pop_set(XMMRegSet set, int offset) { 4597 int restore_size = set.size() * xmm_save_size(); 4598 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be"); 4599 4600 int restore_offset = offset + restore_size - xmm_save_size(); 4601 4602 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) { 4603 restore_xmm_register(this, restore_offset, *it); 4604 restore_offset -= xmm_save_size(); 4605 } 4606 } 4607 4608 void MacroAssembler::push_set(RegSet set, int offset) { 4609 int spill_offset; 4610 if (offset == -1) { 4611 int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size; 4612 int aligned_size = align_up(register_push_size, StackAlignmentInBytes); 4613 subptr(rsp, aligned_size); 4614 spill_offset = 0; 4615 } else { 4616 spill_offset = offset; 4617 } 4618 4619 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) { 4620 movptr(Address(rsp, spill_offset), *it); 4621 spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size; 4622 } 4623 } 4624 4625 void MacroAssembler::pop_set(RegSet set, int offset) { 4626 4627 int gp_reg_size = Register::max_slots_per_register * 
VMRegImpl::stack_slot_size; 4628 int restore_size = set.size() * gp_reg_size; 4629 int aligned_size = align_up(restore_size, StackAlignmentInBytes); 4630 4631 int restore_offset; 4632 if (offset == -1) { 4633 restore_offset = restore_size - gp_reg_size; 4634 } else { 4635 restore_offset = offset + restore_size - gp_reg_size; 4636 } 4637 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) { 4638 movptr(*it, Address(rsp, restore_offset)); 4639 restore_offset -= gp_reg_size; 4640 } 4641 4642 if (offset == -1) { 4643 addptr(rsp, aligned_size); 4644 } 4645 } 4646 4647 // Preserves the contents of address, destroys the contents length_in_bytes and temp. 4648 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) { 4649 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different"); 4650 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord"); 4651 Label done; 4652 4653 testptr(length_in_bytes, length_in_bytes); 4654 jcc(Assembler::zero, done); 4655 4656 // initialize topmost word, divide index by 2, check if odd and test if zero 4657 // note: for the remaining code to work, index must be a multiple of BytesPerWord 4658 #ifdef ASSERT 4659 { 4660 Label L; 4661 testptr(length_in_bytes, BytesPerWord - 1); 4662 jcc(Assembler::zero, L); 4663 stop("length must be a multiple of BytesPerWord"); 4664 bind(L); 4665 } 4666 #endif 4667 Register index = length_in_bytes; 4668 xorptr(temp, temp); // use _zero reg to clear memory (shorter code) 4669 if (UseIncDec) { 4670 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set 4671 } else { 4672 shrptr(index, 2); // use 2 instructions to avoid partial flag stall 4673 shrptr(index, 1); 4674 } 4675 #ifndef _LP64 4676 // index could have not been a multiple of 8 (i.e., bit 2 was set) 4677 { 4678 Label even; 4679 // note: if index was a multiple of 8, then it cannot 4680 // be 0 now otherwise it must have been 0 before 4681 // => if it is even, we don't need to check for 0 again 4682 jcc(Assembler::carryClear, even); 4683 // clear topmost word (no jump would be needed if conditional assignment worked here) 4684 movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp); 4685 // index could be 0 now, must check again 4686 jcc(Assembler::zero, done); 4687 bind(even); 4688 } 4689 #endif // !_LP64 4690 // initialize remaining object fields: index is a multiple of 2 now 4691 { 4692 Label loop; 4693 bind(loop); 4694 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp); 4695 NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);) 4696 decrement(index); 4697 jcc(Assembler::notZero, loop); 4698 } 4699 4700 bind(done); 4701 } 4702 4703 void MacroAssembler::get_inline_type_field_klass(Register holder_klass, Register index, Register inline_klass) { 4704 inline_layout_info(holder_klass, index, inline_klass); 4705 movptr(inline_klass, Address(inline_klass, InlineLayoutInfo::klass_offset())); 4706 } 4707 4708 void MacroAssembler::inline_layout_info(Register holder_klass, Register index, Register layout_info) { 4709 movptr(layout_info, Address(holder_klass, InstanceKlass::inline_layout_info_array_offset())); 4710 #ifdef ASSERT 4711 { 4712 Label done; 4713 cmpptr(layout_info, 0); 4714 jcc(Assembler::notEqual, done); 4715 stop("inline_layout_info_array is null"); 4716 bind(done); 4717 } 
4718 #endif 4719 4720 InlineLayoutInfo array[2]; 4721 int size = (char*)&array[1] - (char*)&array[0]; // computing size of array elements 4722 if (is_power_of_2(size)) { 4723 shll(index, log2i_exact(size)); // Scale index by power of 2 4724 } else { 4725 imull(index, index, size); // Scale the index to be the entry index * array_element_size 4726 } 4727 lea(layout_info, Address(layout_info, index, Address::times_1, Array<InlineLayoutInfo>::base_offset_in_bytes())); 4728 } 4729 4730 // Look up the method for a megamorphic invokeinterface call. 4731 // The target method is determined by <intf_klass, itable_index>. 4732 // The receiver klass is in recv_klass. 4733 // On success, the result will be in method_result, and execution falls through. 4734 // On failure, execution transfers to the given label. 4735 void MacroAssembler::lookup_interface_method(Register recv_klass, 4736 Register intf_klass, 4737 RegisterOrConstant itable_index, 4738 Register method_result, 4739 Register scan_temp, 4740 Label& L_no_such_interface, 4741 bool return_method) { 4742 assert_different_registers(recv_klass, intf_klass, scan_temp); 4743 assert_different_registers(method_result, intf_klass, scan_temp); 4744 assert(recv_klass != method_result || !return_method, 4745 "recv_klass can be destroyed when method isn't needed"); 4746 4747 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 4748 "caller must use same register for non-constant itable index as for method"); 4749 4750 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 4751 int vtable_base = in_bytes(Klass::vtable_start_offset()); 4752 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 4753 int scan_step = itableOffsetEntry::size() * wordSize; 4754 int vte_size = vtableEntry::size_in_bytes(); 4755 Address::ScaleFactor times_vte_scale = Address::times_ptr; 4756 assert(vte_size == wordSize, "else adjust times_vte_scale"); 4757 4758 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 4759 4760 // Could store the aligned, prescaled offset in the klass. 4761 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 4762 4763 if (return_method) { 4764 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 4765 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 4766 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 4767 } 4768 4769 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 4770 // if (scan->interface() == intf) { 4771 // result = (klass + scan->offset() + itable_index); 4772 // } 4773 // } 4774 Label search, found_method; 4775 4776 for (int peel = 1; peel >= 0; peel--) { 4777 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset())); 4778 cmpptr(intf_klass, method_result); 4779 4780 if (peel) { 4781 jccb(Assembler::equal, found_method); 4782 } else { 4783 jccb(Assembler::notEqual, search); 4784 // (invert the test to fall through to found_method...) 4785 } 4786 4787 if (!peel) break; 4788 4789 bind(search); 4790 4791 // Check that the previous entry is non-null. A null entry means that 4792 // the receiver class doesn't implement the interface, and wasn't the 4793 // same as when the caller was compiled. 
4794 testptr(method_result, method_result); 4795 jcc(Assembler::zero, L_no_such_interface); 4796 addptr(scan_temp, scan_step); 4797 } 4798 4799 bind(found_method); 4800 4801 if (return_method) { 4802 // Got a hit. 4803 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset())); 4804 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1)); 4805 } 4806 } 4807 4808 // Look up the method for a megamorphic invokeinterface call in a single pass over itable: 4809 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData 4810 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index 4811 // The target method is determined by <holder_klass, itable_index>. 4812 // The receiver klass is in recv_klass. 4813 // On success, the result will be in method_result, and execution falls through. 4814 // On failure, execution transfers to the given label. 4815 void MacroAssembler::lookup_interface_method_stub(Register recv_klass, 4816 Register holder_klass, 4817 Register resolved_klass, 4818 Register method_result, 4819 Register scan_temp, 4820 Register temp_reg2, 4821 Register receiver, 4822 int itable_index, 4823 Label& L_no_such_interface) { 4824 assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver); 4825 Register temp_itbl_klass = method_result; 4826 Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl 4827 4828 int vtable_base = in_bytes(Klass::vtable_start_offset()); 4829 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 4830 int scan_step = itableOffsetEntry::size() * wordSize; 4831 int vte_size = vtableEntry::size_in_bytes(); 4832 int ioffset = in_bytes(itableOffsetEntry::interface_offset()); 4833 int ooffset = in_bytes(itableOffsetEntry::offset_offset()); 4834 Address::ScaleFactor times_vte_scale = Address::times_ptr; 4835 assert(vte_size == wordSize, "adjust times_vte_scale"); 4836 4837 Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found; 4838 4839 // temp_itbl_klass = recv_klass.itable[0] 4840 // scan_temp = &recv_klass.itable[0] + step 4841 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 4842 movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset)); 4843 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step)); 4844 xorptr(temp_reg, temp_reg); 4845 4846 // Initial checks: 4847 // - if (holder_klass != resolved_klass), go to "scan for resolved" 4848 // - if (itable[0] == 0), no such interface 4849 // - if (itable[0] == holder_klass), shortcut to "holder found" 4850 cmpptr(holder_klass, resolved_klass); 4851 jccb(Assembler::notEqual, L_loop_scan_resolved_entry); 4852 testptr(temp_itbl_klass, temp_itbl_klass); 4853 jccb(Assembler::zero, L_no_such_interface); 4854 cmpptr(holder_klass, temp_itbl_klass); 4855 jccb(Assembler::equal, L_holder_found); 4856 4857 // Loop: Look for holder_klass record in itable 4858 // do { 4859 // tmp = itable[index]; 4860 // index += step; 4861 // if (tmp == holder_klass) { 4862 // goto L_holder_found; // Found! 4863 // } 4864 // } while (tmp != 0); 4865 // goto L_no_such_interface // Not found. 
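// Note: the holder scan below is entered either by falling through the initial
// checks (when resolved_klass == holder_klass) or from L_resolved_found when the
// resolved entry was reached before any holder entry was seen (temp_reg still 0).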
4866 Label L_scan_holder; 4867 bind(L_scan_holder); 4868 movptr(temp_itbl_klass, Address(scan_temp, 0)); 4869 addptr(scan_temp, scan_step); 4870 cmpptr(holder_klass, temp_itbl_klass); 4871 jccb(Assembler::equal, L_holder_found); 4872 testptr(temp_itbl_klass, temp_itbl_klass); 4873 jccb(Assembler::notZero, L_scan_holder); 4874 4875 jmpb(L_no_such_interface); 4876 4877 // Loop: Look for resolved_class record in itable 4878 // do { 4879 // tmp = itable[index]; 4880 // index += step; 4881 // if (tmp == holder_klass) { 4882 // // Also check if we have met a holder klass 4883 // holder_tmp = itable[index-step-ioffset]; 4884 // } 4885 // if (tmp == resolved_klass) { 4886 // goto L_resolved_found; // Found! 4887 // } 4888 // } while (tmp != 0); 4889 // goto L_no_such_interface // Not found. 4890 // 4891 Label L_loop_scan_resolved; 4892 bind(L_loop_scan_resolved); 4893 movptr(temp_itbl_klass, Address(scan_temp, 0)); 4894 addptr(scan_temp, scan_step); 4895 bind(L_loop_scan_resolved_entry); 4896 cmpptr(holder_klass, temp_itbl_klass); 4897 cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step)); 4898 cmpptr(resolved_klass, temp_itbl_klass); 4899 jccb(Assembler::equal, L_resolved_found); 4900 testptr(temp_itbl_klass, temp_itbl_klass); 4901 jccb(Assembler::notZero, L_loop_scan_resolved); 4902 4903 jmpb(L_no_such_interface); 4904 4905 Label L_ready; 4906 4907 // See if we already have a holder klass. If not, go and scan for it. 4908 bind(L_resolved_found); 4909 testptr(temp_reg, temp_reg); 4910 jccb(Assembler::zero, L_scan_holder); 4911 jmpb(L_ready); 4912 4913 bind(L_holder_found); 4914 movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step)); 4915 4916 // Finally, temp_reg contains holder_klass vtable offset 4917 bind(L_ready); 4918 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 4919 if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl 4920 load_klass(scan_temp, receiver, noreg); 4921 movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off)); 4922 } else { 4923 movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off)); 4924 } 4925 } 4926 4927 4928 // virtual method calling 4929 void MacroAssembler::lookup_virtual_method(Register recv_klass, 4930 RegisterOrConstant vtable_index, 4931 Register method_result) { 4932 const ByteSize base = Klass::vtable_start_offset(); 4933 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below"); 4934 Address vtable_entry_addr(recv_klass, 4935 vtable_index, Address::times_ptr, 4936 base + vtableEntry::method_offset()); 4937 movptr(method_result, vtable_entry_addr); 4938 } 4939 4940 4941 void MacroAssembler::check_klass_subtype(Register sub_klass, 4942 Register super_klass, 4943 Register temp_reg, 4944 Label& L_success) { 4945 Label L_failure; 4946 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, nullptr); 4947 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr); 4948 bind(L_failure); 4949 } 4950 4951 4952 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 4953 Register super_klass, 4954 Register temp_reg, 4955 Label* L_success, 4956 Label* L_failure, 4957 Label* L_slow_path, 4958 RegisterOrConstant super_check_offset) { 4959 assert_different_registers(sub_klass, super_klass, temp_reg); 4960 bool must_load_sco = 
(super_check_offset.constant_or_zero() == -1); 4961 if (super_check_offset.is_register()) { 4962 assert_different_registers(sub_klass, super_klass, 4963 super_check_offset.as_register()); 4964 } else if (must_load_sco) { 4965 assert(temp_reg != noreg, "supply either a temp or a register offset"); 4966 } 4967 4968 Label L_fallthrough; 4969 int label_nulls = 0; 4970 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 4971 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 4972 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 4973 assert(label_nulls <= 1, "at most one null in the batch"); 4974 4975 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 4976 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 4977 Address super_check_offset_addr(super_klass, sco_offset); 4978 4979 // Hacked jcc, which "knows" that L_fallthrough, at least, is in 4980 // range of a jccb. If this routine grows larger, reconsider at 4981 // least some of these. 4982 #define local_jcc(assembler_cond, label) \ 4983 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \ 4984 else jcc( assembler_cond, label) /*omit semi*/ 4985 4986 // Hacked jmp, which may only be used just before L_fallthrough. 4987 #define final_jmp(label) \ 4988 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 4989 else jmp(label) /*omit semi*/ 4990 4991 // If the pointers are equal, we are done (e.g., String[] elements). 4992 // This self-check enables sharing of secondary supertype arrays among 4993 // non-primary types such as array-of-interface. Otherwise, each such 4994 // type would need its own customized SSA. 4995 // We move this check to the front of the fast path because many 4996 // type checks are in fact trivially successful in this manner, 4997 // so we get a nicely predicted branch right at the start of the check. 4998 cmpptr(sub_klass, super_klass); 4999 local_jcc(Assembler::equal, *L_success); 5000 5001 // Check the supertype display: 5002 if (must_load_sco) { 5003 // Positive movl does right thing on LP64. 5004 movl(temp_reg, super_check_offset_addr); 5005 super_check_offset = RegisterOrConstant(temp_reg); 5006 } 5007 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0); 5008 cmpptr(super_klass, super_check_addr); // load displayed supertype 5009 5010 // This check has worked decisively for primary supers. 5011 // Secondary supers are sought in the super_cache ('super_cache_addr'). 5012 // (Secondary supers are interfaces and very deeply nested subtypes.) 5013 // This works in the same check above because of a tricky aliasing 5014 // between the super_cache and the primary super display elements. 5015 // (The 'super_check_addr' can address either, as the case requires.) 5016 // Note that the cache is updated below if it does not help us find 5017 // what we need immediately. 5018 // So if it was a primary super, we can just fail immediately. 5019 // Otherwise, it's the slow path for us (no success at this point). 5020 5021 if (super_check_offset.is_register()) { 5022 local_jcc(Assembler::equal, *L_success); 5023 cmpl(super_check_offset.as_register(), sc_offset); 5024 if (L_failure == &L_fallthrough) { 5025 local_jcc(Assembler::equal, *L_slow_path); 5026 } else { 5027 local_jcc(Assembler::notEqual, *L_failure); 5028 final_jmp(*L_slow_path); 5029 } 5030 } else if (super_check_offset.as_constant() == sc_offset) { 5031 // Need a slow path; fast failure is impossible. 
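// (The constant offset designates the secondary-super cache slot itself, so the
//  comparison above can only confirm a cached hit; a miss tells us nothing about
//  the subtype relation.)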
5032 if (L_slow_path == &L_fallthrough) { 5033 local_jcc(Assembler::equal, *L_success); 5034 } else { 5035 local_jcc(Assembler::notEqual, *L_slow_path); 5036 final_jmp(*L_success); 5037 } 5038 } else { 5039 // No slow path; it's a fast decision. 5040 if (L_failure == &L_fallthrough) { 5041 local_jcc(Assembler::equal, *L_success); 5042 } else { 5043 local_jcc(Assembler::notEqual, *L_failure); 5044 final_jmp(*L_success); 5045 } 5046 } 5047 5048 bind(L_fallthrough); 5049 5050 #undef local_jcc 5051 #undef final_jmp 5052 } 5053 5054 5055 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass, 5056 Register super_klass, 5057 Register temp_reg, 5058 Register temp2_reg, 5059 Label* L_success, 5060 Label* L_failure, 5061 bool set_cond_codes) { 5062 assert_different_registers(sub_klass, super_klass, temp_reg); 5063 if (temp2_reg != noreg) 5064 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg); 5065 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 5066 5067 Label L_fallthrough; 5068 int label_nulls = 0; 5069 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 5070 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 5071 assert(label_nulls <= 1, "at most one null in the batch"); 5072 5073 // a couple of useful fields in sub_klass: 5074 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 5075 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 5076 Address secondary_supers_addr(sub_klass, ss_offset); 5077 Address super_cache_addr( sub_klass, sc_offset); 5078 5079 // Do a linear scan of the secondary super-klass chain. 5080 // This code is rarely used, so simplicity is a virtue here. 5081 // The repne_scan instruction uses fixed registers, which we must spill. 5082 // Don't worry too much about pre-existing connections with the input regs. 5083 5084 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super) 5085 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter) 5086 5087 // Get super_klass value into rax (even if it was in rdi or rcx). 5088 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false; 5089 if (super_klass != rax) { 5090 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; } 5091 mov(rax, super_klass); 5092 } 5093 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; } 5094 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; } 5095 5096 #ifndef PRODUCT 5097 uint* pst_counter = &SharedRuntime::_partial_subtype_ctr; 5098 ExternalAddress pst_counter_addr((address) pst_counter); 5099 NOT_LP64( incrementl(pst_counter_addr) ); 5100 LP64_ONLY( lea(rcx, pst_counter_addr) ); 5101 LP64_ONLY( incrementl(Address(rcx, 0)) ); 5102 #endif //PRODUCT 5103 5104 // We will consult the secondary-super array. 5105 movptr(rdi, secondary_supers_addr); 5106 // Load the array length. (Positive movl does right thing on LP64.) 5107 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes())); 5108 // Skip to start of data. 5109 addptr(rdi, Array<Klass*>::base_offset_in_bytes()); 5110 5111 // Scan RCX words at [RDI] for an occurrence of RAX. 5112 // Set NZ/Z based on last compare. 5113 // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does 5114 // not change flags (only scas instruction which is repeated sets flags). 5115 // Set Z = 0 (not equal) before 'repne' to indicate that class was not found. 5116 5117 testptr(rax,rax); // Set Z = 0 5118 repne_scan(); 5119 5120 // Unspill the temp. 
registers: 5121 if (pushed_rdi) pop(rdi); 5122 if (pushed_rcx) pop(rcx); 5123 if (pushed_rax) pop(rax); 5124 5125 if (set_cond_codes) { 5126 // Special hack for the AD files: rdi is guaranteed non-zero. 5127 assert(!pushed_rdi, "rdi must be left non-null"); 5128 // Also, the condition codes are properly set Z/NZ on succeed/failure. 5129 } 5130 5131 if (L_failure == &L_fallthrough) 5132 jccb(Assembler::notEqual, *L_failure); 5133 else jcc(Assembler::notEqual, *L_failure); 5134 5135 // Success. Cache the super we found and proceed in triumph. 5136 movptr(super_cache_addr, super_klass); 5137 5138 if (L_success != &L_fallthrough) { 5139 jmp(*L_success); 5140 } 5141 5142 #undef IS_A_TEMP 5143 5144 bind(L_fallthrough); 5145 } 5146 5147 #ifndef _LP64 5148 5149 // 32-bit x86 only: always use the linear search. 5150 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 5151 Register super_klass, 5152 Register temp_reg, 5153 Register temp2_reg, 5154 Label* L_success, 5155 Label* L_failure, 5156 bool set_cond_codes) { 5157 check_klass_subtype_slow_path_linear 5158 (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, set_cond_codes); 5159 } 5160 5161 #else // _LP64 5162 5163 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 5164 Register super_klass, 5165 Register temp_reg, 5166 Register temp2_reg, 5167 Label* L_success, 5168 Label* L_failure, 5169 bool set_cond_codes) { 5170 assert(set_cond_codes == false, "must be false on 64-bit x86"); 5171 check_klass_subtype_slow_path 5172 (sub_klass, super_klass, temp_reg, temp2_reg, noreg, noreg, 5173 L_success, L_failure); 5174 } 5175 5176 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 5177 Register super_klass, 5178 Register temp_reg, 5179 Register temp2_reg, 5180 Register temp3_reg, 5181 Register temp4_reg, 5182 Label* L_success, 5183 Label* L_failure) { 5184 if (UseSecondarySupersTable) { 5185 check_klass_subtype_slow_path_table 5186 (sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, temp4_reg, 5187 L_success, L_failure); 5188 } else { 5189 check_klass_subtype_slow_path_linear 5190 (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, /*set_cond_codes*/false); 5191 } 5192 } 5193 5194 Register MacroAssembler::allocate_if_noreg(Register r, 5195 RegSetIterator<Register> &available_regs, 5196 RegSet ®s_to_push) { 5197 if (!r->is_valid()) { 5198 r = *available_regs++; 5199 regs_to_push += r; 5200 } 5201 return r; 5202 } 5203 5204 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass, 5205 Register super_klass, 5206 Register temp_reg, 5207 Register temp2_reg, 5208 Register temp3_reg, 5209 Register result_reg, 5210 Label* L_success, 5211 Label* L_failure) { 5212 // NB! Callers may assume that, when temp2_reg is a valid register, 5213 // this code sets it to a nonzero value. 
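  // Sketch of what follows: remember whether the caller supplied temp2_reg,
  // allocate (and spill) any temps it did not supply, call
  // lookup_secondary_supers_table_var, compare its result against zero,
  // restore the spilled registers without touching the flags (lea rather
  // than addptr), and only then branch on the preserved comparison. The
  // movq(temp2_reg, 1) near the end re-establishes the nonzero-value
  // contract stated above in case the lookup clobbered temp2_reg.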
5214 bool temp2_reg_was_valid = temp2_reg->is_valid(); 5215 5216 RegSet temps = RegSet::of(temp_reg, temp2_reg, temp3_reg); 5217 5218 Label L_fallthrough; 5219 int label_nulls = 0; 5220 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 5221 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 5222 assert(label_nulls <= 1, "at most one null in the batch"); 5223 5224 BLOCK_COMMENT("check_klass_subtype_slow_path_table"); 5225 5226 RegSetIterator<Register> available_regs 5227 = (RegSet::of(rax, rcx, rdx, r8) + r9 + r10 + r11 + r12 - temps - sub_klass - super_klass).begin(); 5228 5229 RegSet pushed_regs; 5230 5231 temp_reg = allocate_if_noreg(temp_reg, available_regs, pushed_regs); 5232 temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs); 5233 temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs); 5234 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs); 5235 Register temp4_reg = allocate_if_noreg(noreg, available_regs, pushed_regs); 5236 5237 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, result_reg); 5238 5239 { 5240 5241 int register_push_size = pushed_regs.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size; 5242 int aligned_size = align_up(register_push_size, StackAlignmentInBytes); 5243 subptr(rsp, aligned_size); 5244 push_set(pushed_regs, 0); 5245 5246 lookup_secondary_supers_table_var(sub_klass, 5247 super_klass, 5248 temp_reg, temp2_reg, temp3_reg, temp4_reg, result_reg); 5249 cmpq(result_reg, 0); 5250 5251 // Unspill the temp. registers: 5252 pop_set(pushed_regs, 0); 5253 // Increment SP but do not clobber flags. 5254 lea(rsp, Address(rsp, aligned_size)); 5255 } 5256 5257 if (temp2_reg_was_valid) { 5258 movq(temp2_reg, 1); 5259 } 5260 5261 jcc(Assembler::notEqual, *L_failure); 5262 5263 if (L_success != &L_fallthrough) { 5264 jmp(*L_success); 5265 } 5266 5267 bind(L_fallthrough); 5268 } 5269 5270 // population_count variant for running without the POPCNT 5271 // instruction, which was introduced with SSE4.2 in 2008. 5272 void MacroAssembler::population_count(Register dst, Register src, 5273 Register scratch1, Register scratch2) { 5274 assert_different_registers(src, scratch1, scratch2); 5275 if (UsePopCountInstruction) { 5276 Assembler::popcntq(dst, src); 5277 } else { 5278 assert_different_registers(src, scratch1, scratch2); 5279 assert_different_registers(dst, scratch1, scratch2); 5280 Label loop, done; 5281 5282 mov(scratch1, src); 5283 // dst = 0; 5284 // while(scratch1 != 0) { 5285 // dst++; 5286 // scratch1 &= (scratch1 - 1); 5287 // } 5288 xorl(dst, dst); 5289 testq(scratch1, scratch1); 5290 jccb(Assembler::equal, done); 5291 { 5292 bind(loop); 5293 incq(dst); 5294 movq(scratch2, scratch1); 5295 decq(scratch2); 5296 andq(scratch1, scratch2); 5297 jccb(Assembler::notEqual, loop); 5298 } 5299 bind(done); 5300 } 5301 #ifdef ASSERT 5302 mov64(scratch1, 0xCafeBabeDeadBeef); 5303 movq(scratch2, scratch1); 5304 #endif 5305 } 5306 5307 // Ensure that the inline code and the stub are using the same registers. 
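// The fixed assignment asserted below (rax = r_super_klass, rbx = r_array_base,
// rcx = r_array_length, rdx = r_array_index, rsi = r_sub_klass, r11 = r_bitmap,
// rdi = result) must agree with the register usage in the stub installed as
// StubRoutines::lookup_secondary_supers_table_slow_path_stub(), because the
// inline fast path calls into that stub on a hash collision.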
5308 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \ 5309 do { \ 5310 assert(r_super_klass == rax, "mismatch"); \ 5311 assert(r_array_base == rbx, "mismatch"); \ 5312 assert(r_array_length == rcx, "mismatch"); \ 5313 assert(r_array_index == rdx, "mismatch"); \ 5314 assert(r_sub_klass == rsi || r_sub_klass == noreg, "mismatch"); \ 5315 assert(r_bitmap == r11 || r_bitmap == noreg, "mismatch"); \ 5316 assert(result == rdi || result == noreg, "mismatch"); \ 5317 } while(0) 5318 5319 // Versions of salq and rorq that don't need count to be in rcx 5320 5321 void MacroAssembler::salq(Register dest, Register count) { 5322 if (count == rcx) { 5323 Assembler::salq(dest); 5324 } else { 5325 assert_different_registers(rcx, dest); 5326 xchgq(rcx, count); 5327 Assembler::salq(dest); 5328 xchgq(rcx, count); 5329 } 5330 } 5331 5332 void MacroAssembler::rorq(Register dest, Register count) { 5333 if (count == rcx) { 5334 Assembler::rorq(dest); 5335 } else { 5336 assert_different_registers(rcx, dest); 5337 xchgq(rcx, count); 5338 Assembler::rorq(dest); 5339 xchgq(rcx, count); 5340 } 5341 } 5342 5343 // Return true: we succeeded in generating this code 5344 // 5345 // At runtime, return 0 in result if r_super_klass is a superclass of 5346 // r_sub_klass, otherwise return nonzero. Use this if you know the 5347 // super_klass_slot of the class you're looking for. This is always 5348 // the case for instanceof and checkcast. 5349 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass, 5350 Register r_super_klass, 5351 Register temp1, 5352 Register temp2, 5353 Register temp3, 5354 Register temp4, 5355 Register result, 5356 u1 super_klass_slot) { 5357 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 5358 5359 Label L_fallthrough, L_success, L_failure; 5360 5361 BLOCK_COMMENT("lookup_secondary_supers_table {"); 5362 5363 const Register 5364 r_array_index = temp1, 5365 r_array_length = temp2, 5366 r_array_base = temp3, 5367 r_bitmap = temp4; 5368 5369 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 5370 5371 xorq(result, result); // = 0 5372 5373 movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset())); 5374 movq(r_array_index, r_bitmap); 5375 5376 // First check the bitmap to see if super_klass might be present. If 5377 // the bit is zero, we are certain that super_klass is not one of 5378 // the secondary supers. 5379 u1 bit = super_klass_slot; 5380 { 5381 // NB: If the count in a x86 shift instruction is 0, the flags are 5382 // not affected, so we do a testq instead. 5383 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit; 5384 if (shift_count != 0) { 5385 salq(r_array_index, shift_count); 5386 } else { 5387 testq(r_array_index, r_array_index); 5388 } 5389 } 5390 // We test the MSB of r_array_index, i.e. its sign bit 5391 jcc(Assembler::positive, L_failure); 5392 5393 // Get the first array index that can contain super_klass into r_array_index. 5394 if (bit != 0) { 5395 population_count(r_array_index, r_array_index, temp2, temp3); 5396 } else { 5397 movl(r_array_index, 1); 5398 } 5399 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 5400 5401 // We will consult the secondary-super array. 5402 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset()))); 5403 5404 // We're asserting that the first word in an Array<Klass*> is the 5405 // length, and the second word is the first word of the data. 
If 5406 // that ever changes, r_array_base will have to be adjusted here. 5407 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 5408 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code"); 5409 5410 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8)); 5411 jccb(Assembler::equal, L_success); 5412 5413 // Is there another entry to check? Consult the bitmap. 5414 btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK); 5415 jccb(Assembler::carryClear, L_failure); 5416 5417 // Linear probe. Rotate the bitmap so that the next bit to test is 5418 // in Bit 1. 5419 if (bit != 0) { 5420 rorq(r_bitmap, bit); 5421 } 5422 5423 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 5424 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 5425 // Kills: r_array_length. 5426 // Returns: result. 5427 call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub())); 5428 // Result (0/1) is in rdi 5429 jmpb(L_fallthrough); 5430 5431 bind(L_failure); 5432 incq(result); // 0 => 1 5433 5434 bind(L_success); 5435 // result = 0; 5436 5437 bind(L_fallthrough); 5438 BLOCK_COMMENT("} lookup_secondary_supers_table"); 5439 5440 if (VerifySecondarySupers) { 5441 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 5442 temp1, temp2, temp3); 5443 } 5444 } 5445 5446 // At runtime, return 0 in result if r_super_klass is a superclass of 5447 // r_sub_klass, otherwise return nonzero. Use this version of 5448 // lookup_secondary_supers_table() if you don't know ahead of time 5449 // which superclass will be searched for. Used by interpreter and 5450 // runtime stubs. It is larger and has somewhat greater latency than 5451 // the version above, which takes a constant super_klass_slot. 5452 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass, 5453 Register r_super_klass, 5454 Register temp1, 5455 Register temp2, 5456 Register temp3, 5457 Register temp4, 5458 Register result) { 5459 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 5460 assert_different_registers(r_sub_klass, r_super_klass, rcx); 5461 RegSet temps = RegSet::of(temp1, temp2, temp3, temp4); 5462 5463 Label L_fallthrough, L_success, L_failure; 5464 5465 BLOCK_COMMENT("lookup_secondary_supers_table {"); 5466 5467 RegSetIterator<Register> available_regs = (temps - rcx).begin(); 5468 5469 // FIXME. Once we are sure that all paths reaching this point really 5470 // do pass rcx as one of our temps we can get rid of the following 5471 // workaround. 5472 assert(temps.contains(rcx), "fix this code"); 5473 5474 // We prefer to have our shift count in rcx. If rcx is one of our 5475 // temps, use it for slot. If not, pick any of our temps. 5476 Register slot; 5477 if (!temps.contains(rcx)) { 5478 slot = *available_regs++; 5479 } else { 5480 slot = rcx; 5481 } 5482 5483 const Register r_array_index = *available_regs++; 5484 const Register r_bitmap = *available_regs++; 5485 5486 // The logic above guarantees this property, but we state it here. 5487 assert_different_registers(r_array_index, r_bitmap, rcx); 5488 5489 movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset())); 5490 movq(r_array_index, r_bitmap); 5491 5492 // First check the bitmap to see if super_klass might be present. If 5493 // the bit is zero, we are certain that super_klass is not one of 5494 // the secondary supers. 
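  // Worked example with illustrative values: suppose super_klass hashes to
  // slot 5 and the bitmap has bits 2 and 5 set. Shifting the bitmap left by
  // 63 - 5 = 58 moves bit 5 into the sign position, so the branch to
  // L_failure below is taken only if slot 5 is empty. The population count
  // of the shifted value (original bits 0..5, here {2, 5}) is 2, i.e. the
  // 1-based position of the candidate entry in the packed secondary-supers
  // array; the off-by-one is absorbed by leaving r_array_base pointing one
  // word before the data (at the length word), as noted below.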
5495 movb(slot, Address(r_super_klass, Klass::hash_slot_offset())); 5496 xorl(slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1)); // slot ^ 63 === 63 - slot (mod 64) 5497 salq(r_array_index, slot); 5498 5499 testq(r_array_index, r_array_index); 5500 // We test the MSB of r_array_index, i.e. its sign bit 5501 jcc(Assembler::positive, L_failure); 5502 5503 const Register r_array_base = *available_regs++; 5504 5505 // Get the first array index that can contain super_klass into r_array_index. 5506 // Note: Clobbers r_array_base and slot. 5507 population_count(r_array_index, r_array_index, /*temp2*/r_array_base, /*temp3*/slot); 5508 5509 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 5510 5511 // We will consult the secondary-super array. 5512 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset()))); 5513 5514 // We're asserting that the first word in an Array<Klass*> is the 5515 // length, and the second word is the first word of the data. If 5516 // that ever changes, r_array_base will have to be adjusted here. 5517 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 5518 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code"); 5519 5520 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8)); 5521 jccb(Assembler::equal, L_success); 5522 5523 // Restore slot to its true value 5524 movb(slot, Address(r_super_klass, Klass::hash_slot_offset())); 5525 5526 // Linear probe. Rotate the bitmap so that the next bit to test is 5527 // in Bit 1. 5528 rorq(r_bitmap, slot); 5529 5530 // Is there another entry to check? Consult the bitmap. 5531 btq(r_bitmap, 1); 5532 jccb(Assembler::carryClear, L_failure); 5533 5534 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 5535 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 5536 // Kills: r_array_length. 5537 // Returns: result. 5538 lookup_secondary_supers_table_slow_path(r_super_klass, 5539 r_array_base, 5540 r_array_index, 5541 r_bitmap, 5542 /*temp1*/result, 5543 /*temp2*/slot, 5544 &L_success, 5545 nullptr); 5546 5547 bind(L_failure); 5548 movq(result, 1); 5549 jmpb(L_fallthrough); 5550 5551 bind(L_success); 5552 xorq(result, result); // = 0 5553 5554 bind(L_fallthrough); 5555 BLOCK_COMMENT("} lookup_secondary_supers_table"); 5556 5557 if (VerifySecondarySupers) { 5558 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 5559 temp1, temp2, temp3); 5560 } 5561 } 5562 5563 void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit, 5564 Label* L_success, Label* L_failure) { 5565 Label L_loop, L_fallthrough; 5566 { 5567 int label_nulls = 0; 5568 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 5569 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 5570 assert(label_nulls <= 1, "at most one null in the batch"); 5571 } 5572 bind(L_loop); 5573 cmpq(value, Address(addr, count, Address::times_8)); 5574 jcc(Assembler::equal, *L_success); 5575 addl(count, 1); 5576 cmpl(count, limit); 5577 jcc(Assembler::less, L_loop); 5578 5579 if (&L_fallthrough != L_failure) { 5580 jmp(*L_failure); 5581 } 5582 bind(L_fallthrough); 5583 } 5584 5585 // Called by code generated by check_klass_subtype_slow_path 5586 // above. This is called when there is a collision in the hashed 5587 // lookup in the secondary supers array. 
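// On entry the current probe index is in r_array_index and the caller has
// already rotated r_bitmap past the bits it tested itself. Roughly:
//
//   do {
//     if (r_array_index >= r_array_length)  r_array_index = 0;     // wrap around
//     if (secondary_supers[r_array_index] == r_super_klass)  goto L_success;
//     if (next bit of the rotated bitmap is clear)  goto L_failure; // run ended
//     rotate r_bitmap by 1;  r_array_index++;
//   } while (true);
//
// Tables whose bitmap is (nearly) full skip this and fall back to a plain
// repne_scanq over the whole array (the L_huge case below).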
5588 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass, 5589 Register r_array_base, 5590 Register r_array_index, 5591 Register r_bitmap, 5592 Register temp1, 5593 Register temp2, 5594 Label* L_success, 5595 Label* L_failure) { 5596 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2); 5597 5598 const Register 5599 r_array_length = temp1, 5600 r_sub_klass = noreg, 5601 result = noreg; 5602 5603 Label L_fallthrough; 5604 int label_nulls = 0; 5605 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 5606 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 5607 assert(label_nulls <= 1, "at most one null in the batch"); 5608 5609 // Load the array length. 5610 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes())); 5611 // And adjust the array base to point to the data. 5612 // NB! Effectively increments current slot index by 1. 5613 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, ""); 5614 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes()); 5615 5616 // Linear probe 5617 Label L_huge; 5618 5619 // The bitmap is full to bursting. 5620 // Implicit invariant: BITMAP_FULL implies (length > 0) 5621 cmpl(r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2); 5622 jcc(Assembler::greater, L_huge); 5623 5624 // NB! Our caller has checked bits 0 and 1 in the bitmap. The 5625 // current slot (at secondary_supers[r_array_index]) has not yet 5626 // been inspected, and r_array_index may be out of bounds if we 5627 // wrapped around the end of the array. 5628 5629 { // This is conventional linear probing, but instead of terminating 5630 // when a null entry is found in the table, we maintain a bitmap 5631 // in which a 0 indicates missing entries. 5632 // The check above guarantees there are 0s in the bitmap, so the loop 5633 // eventually terminates. 5634 5635 xorl(temp2, temp2); // = 0; 5636 5637 Label L_again; 5638 bind(L_again); 5639 5640 // Check for array wraparound. 5641 cmpl(r_array_index, r_array_length); 5642 cmovl(Assembler::greaterEqual, r_array_index, temp2); 5643 5644 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8)); 5645 jcc(Assembler::equal, *L_success); 5646 5647 // If the next bit in bitmap is zero, we're done. 5648 btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now 5649 jcc(Assembler::carryClear, *L_failure); 5650 5651 rorq(r_bitmap, 1); // Bits 1/2 => 0/1 5652 addl(r_array_index, 1); 5653 5654 jmp(L_again); 5655 } 5656 5657 { // Degenerate case: more than 64 secondary supers. 5658 // FIXME: We could do something smarter here, maybe a vectorized 5659 // comparison or a binary search, but is that worth any added 5660 // complexity? 5661 bind(L_huge); 5662 xorl(r_array_index, r_array_index); // = 0 5663 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, 5664 L_success, 5665 (&L_fallthrough != L_failure ? L_failure : nullptr)); 5666 5667 bind(L_fallthrough); 5668 } 5669 } 5670 5671 struct VerifyHelperArguments { 5672 Klass* _super; 5673 Klass* _sub; 5674 intptr_t _linear_result; 5675 intptr_t _table_result; 5676 }; 5677 5678 static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) { 5679 Klass::on_secondary_supers_verification_failure(args->_super, 5680 args->_sub, 5681 args->_linear_result, 5682 args->_table_result, 5683 msg); 5684 } 5685 5686 // Make sure that the hashed lookup and a linear scan agree. 
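// The code below recomputes the answer with a plain repne_scanq over the
// whole secondary-supers array (0 = present, 1 = absent, the same convention
// as the hashed lookup's result register) and, if the two answers disagree,
// builds a VerifyHelperArguments record on the stack and calls
// verify_secondary_supers_table_helper, followed by should_not_reach_here().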
5687 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass, 5688 Register r_super_klass, 5689 Register result, 5690 Register temp1, 5691 Register temp2, 5692 Register temp3) { 5693 const Register 5694 r_array_index = temp1, 5695 r_array_length = temp2, 5696 r_array_base = temp3, 5697 r_bitmap = noreg; 5698 5699 BLOCK_COMMENT("verify_secondary_supers_table {"); 5700 5701 Label L_success, L_failure, L_check, L_done; 5702 5703 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset()))); 5704 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes())); 5705 // And adjust the array base to point to the data. 5706 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes()); 5707 5708 testl(r_array_length, r_array_length); // array_length == 0? 5709 jcc(Assembler::zero, L_failure); 5710 5711 movl(r_array_index, 0); 5712 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success); 5713 // fall through to L_failure 5714 5715 const Register linear_result = r_array_index; // reuse temp1 5716 5717 bind(L_failure); // not present 5718 movl(linear_result, 1); 5719 jmp(L_check); 5720 5721 bind(L_success); // present 5722 movl(linear_result, 0); 5723 5724 bind(L_check); 5725 cmpl(linear_result, result); 5726 jcc(Assembler::equal, L_done); 5727 5728 { // To avoid calling convention issues, build a record on the stack 5729 // and pass the pointer to that instead. 5730 push(result); 5731 push(linear_result); 5732 push(r_sub_klass); 5733 push(r_super_klass); 5734 movptr(c_rarg1, rsp); 5735 movptr(c_rarg0, (uintptr_t) "mismatch"); 5736 call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper))); 5737 should_not_reach_here(); 5738 } 5739 bind(L_done); 5740 5741 BLOCK_COMMENT("} verify_secondary_supers_table"); 5742 } 5743 5744 #undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS 5745 5746 #endif // LP64 5747 5748 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 5749 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 5750 5751 Label L_fallthrough; 5752 if (L_fast_path == nullptr) { 5753 L_fast_path = &L_fallthrough; 5754 } else if (L_slow_path == nullptr) { 5755 L_slow_path = &L_fallthrough; 5756 } 5757 5758 // Fast path check: class is fully initialized. 5759 // init_state needs acquire, but x86 is TSO, and so we are already good. 
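  // (The second fast-path check below also lets the initializing thread
  //  itself through: during <clinit> the class is not yet fully initialized,
  //  but its initializer thread must still be allowed to use it.)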
5760 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized); 5761 jcc(Assembler::equal, *L_fast_path); 5762 5763 // Fast path check: current thread is initializer thread 5764 cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset())); 5765 if (L_slow_path == &L_fallthrough) { 5766 jcc(Assembler::equal, *L_fast_path); 5767 bind(*L_slow_path); 5768 } else if (L_fast_path == &L_fallthrough) { 5769 jcc(Assembler::notEqual, *L_slow_path); 5770 bind(*L_fast_path); 5771 } else { 5772 Unimplemented(); 5773 } 5774 } 5775 5776 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) { 5777 if (VM_Version::supports_cmov()) { 5778 cmovl(cc, dst, src); 5779 } else { 5780 Label L; 5781 jccb(negate_condition(cc), L); 5782 movl(dst, src); 5783 bind(L); 5784 } 5785 } 5786 5787 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) { 5788 if (VM_Version::supports_cmov()) { 5789 cmovl(cc, dst, src); 5790 } else { 5791 Label L; 5792 jccb(negate_condition(cc), L); 5793 movl(dst, src); 5794 bind(L); 5795 } 5796 } 5797 5798 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) { 5799 if (!VerifyOops || VerifyAdapterSharing) { 5800 // Below address of the code string confuses VerifyAdapterSharing 5801 // because it may differ between otherwise equivalent adapters. 5802 return; 5803 } 5804 5805 BLOCK_COMMENT("verify_oop {"); 5806 #ifdef _LP64 5807 push(rscratch1); 5808 #endif 5809 push(rax); // save rax 5810 push(reg); // pass register argument 5811 5812 // Pass register number to verify_oop_subroutine 5813 const char* b = nullptr; 5814 { 5815 ResourceMark rm; 5816 stringStream ss; 5817 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line); 5818 b = code_string(ss.as_string()); 5819 } 5820 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate()); 5821 pushptr(buffer.addr(), rscratch1); 5822 5823 // call indirectly to solve generation ordering problem 5824 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 5825 call(rax); 5826 // Caller pops the arguments (oop, message) and restores rax, r10 5827 BLOCK_COMMENT("} verify_oop"); 5828 } 5829 5830 void MacroAssembler::vallones(XMMRegister dst, int vector_len) { 5831 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 5832 // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without 5833 // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog 5834 vpternlogd(dst, 0xFF, dst, dst, vector_len); 5835 } else if (VM_Version::supports_avx()) { 5836 vpcmpeqd(dst, dst, dst, vector_len); 5837 } else { 5838 pcmpeqd(dst, dst); 5839 } 5840 } 5841 5842 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 5843 int extra_slot_offset) { 5844 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
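  // The address produced below is
  //   rsp + Interpreter::expr_offset_in_bytes(extra_slot_offset)
  //       + arg_slot * Interpreter::stackElementSize + wordSize
  // where the trailing wordSize skips over the return PC sitting on the
  // stack; for a register arg_slot the slot scaling is carried by the
  // Address scale factor instead of being folded into the constant offset.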
5845 int stackElementSize = Interpreter::stackElementSize; 5846 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 5847 #ifdef ASSERT 5848 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 5849 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 5850 #endif 5851 Register scale_reg = noreg; 5852 Address::ScaleFactor scale_factor = Address::no_scale; 5853 if (arg_slot.is_constant()) { 5854 offset += arg_slot.as_constant() * stackElementSize; 5855 } else { 5856 scale_reg = arg_slot.as_register(); 5857 scale_factor = Address::times(stackElementSize); 5858 } 5859 offset += wordSize; // return PC is on stack 5860 return Address(rsp, scale_reg, scale_factor, offset); 5861 } 5862 5863 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) { 5864 if (!VerifyOops || VerifyAdapterSharing) { 5865 // Below address of the code string confuses VerifyAdapterSharing 5866 // because it may differ between otherwise equivalent adapters. 5867 return; 5868 } 5869 5870 #ifdef _LP64 5871 push(rscratch1); 5872 #endif 5873 push(rax); // save rax, 5874 // addr may contain rsp so we will have to adjust it based on the push 5875 // we just did (and on 64 bit we do two pushes) 5876 // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which 5877 // stores rax into addr which is backwards of what was intended. 5878 if (addr.uses(rsp)) { 5879 lea(rax, addr); 5880 pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord)); 5881 } else { 5882 pushptr(addr); 5883 } 5884 5885 // Pass register number to verify_oop_subroutine 5886 const char* b = nullptr; 5887 { 5888 ResourceMark rm; 5889 stringStream ss; 5890 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line); 5891 b = code_string(ss.as_string()); 5892 } 5893 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate()); 5894 pushptr(buffer.addr(), rscratch1); 5895 5896 // call indirectly to solve generation ordering problem 5897 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 5898 call(rax); 5899 // Caller pops the arguments (addr, message) and restores rax, r10. 
5900 } 5901 5902 void MacroAssembler::verify_tlab() { 5903 #ifdef ASSERT 5904 if (UseTLAB && VerifyOops) { 5905 Label next, ok; 5906 Register t1 = rsi; 5907 Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread); 5908 5909 push(t1); 5910 NOT_LP64(push(thread_reg)); 5911 NOT_LP64(get_thread(thread_reg)); 5912 5913 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); 5914 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset()))); 5915 jcc(Assembler::aboveEqual, next); 5916 STOP("assert(top >= start)"); 5917 should_not_reach_here(); 5918 5919 bind(next); 5920 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset()))); 5921 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); 5922 jcc(Assembler::aboveEqual, ok); 5923 STOP("assert(top <= end)"); 5924 should_not_reach_here(); 5925 5926 bind(ok); 5927 NOT_LP64(pop(thread_reg)); 5928 pop(t1); 5929 } 5930 #endif 5931 } 5932 5933 class ControlWord { 5934 public: 5935 int32_t _value; 5936 5937 int rounding_control() const { return (_value >> 10) & 3 ; } 5938 int precision_control() const { return (_value >> 8) & 3 ; } 5939 bool precision() const { return ((_value >> 5) & 1) != 0; } 5940 bool underflow() const { return ((_value >> 4) & 1) != 0; } 5941 bool overflow() const { return ((_value >> 3) & 1) != 0; } 5942 bool zero_divide() const { return ((_value >> 2) & 1) != 0; } 5943 bool denormalized() const { return ((_value >> 1) & 1) != 0; } 5944 bool invalid() const { return ((_value >> 0) & 1) != 0; } 5945 5946 void print() const { 5947 // rounding control 5948 const char* rc; 5949 switch (rounding_control()) { 5950 case 0: rc = "round near"; break; 5951 case 1: rc = "round down"; break; 5952 case 2: rc = "round up "; break; 5953 case 3: rc = "chop "; break; 5954 default: 5955 rc = nullptr; // silence compiler warnings 5956 fatal("Unknown rounding control: %d", rounding_control()); 5957 }; 5958 // precision control 5959 const char* pc; 5960 switch (precision_control()) { 5961 case 0: pc = "24 bits "; break; 5962 case 1: pc = "reserved"; break; 5963 case 2: pc = "53 bits "; break; 5964 case 3: pc = "64 bits "; break; 5965 default: 5966 pc = nullptr; // silence compiler warnings 5967 fatal("Unknown precision control: %d", precision_control()); 5968 }; 5969 // flags 5970 char f[9]; 5971 f[0] = ' '; 5972 f[1] = ' '; 5973 f[2] = (precision ()) ? 'P' : 'p'; 5974 f[3] = (underflow ()) ? 'U' : 'u'; 5975 f[4] = (overflow ()) ? 'O' : 'o'; 5976 f[5] = (zero_divide ()) ? 'Z' : 'z'; 5977 f[6] = (denormalized()) ? 'D' : 'd'; 5978 f[7] = (invalid ()) ? 
'I' : 'i'; 5979 f[8] = '\x0'; 5980 // output 5981 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc); 5982 } 5983 5984 }; 5985 5986 class StatusWord { 5987 public: 5988 int32_t _value; 5989 5990 bool busy() const { return ((_value >> 15) & 1) != 0; } 5991 bool C3() const { return ((_value >> 14) & 1) != 0; } 5992 bool C2() const { return ((_value >> 10) & 1) != 0; } 5993 bool C1() const { return ((_value >> 9) & 1) != 0; } 5994 bool C0() const { return ((_value >> 8) & 1) != 0; } 5995 int top() const { return (_value >> 11) & 7 ; } 5996 bool error_status() const { return ((_value >> 7) & 1) != 0; } 5997 bool stack_fault() const { return ((_value >> 6) & 1) != 0; } 5998 bool precision() const { return ((_value >> 5) & 1) != 0; } 5999 bool underflow() const { return ((_value >> 4) & 1) != 0; } 6000 bool overflow() const { return ((_value >> 3) & 1) != 0; } 6001 bool zero_divide() const { return ((_value >> 2) & 1) != 0; } 6002 bool denormalized() const { return ((_value >> 1) & 1) != 0; } 6003 bool invalid() const { return ((_value >> 0) & 1) != 0; } 6004 6005 void print() const { 6006 // condition codes 6007 char c[5]; 6008 c[0] = (C3()) ? '3' : '-'; 6009 c[1] = (C2()) ? '2' : '-'; 6010 c[2] = (C1()) ? '1' : '-'; 6011 c[3] = (C0()) ? '0' : '-'; 6012 c[4] = '\x0'; 6013 // flags 6014 char f[9]; 6015 f[0] = (error_status()) ? 'E' : '-'; 6016 f[1] = (stack_fault ()) ? 'S' : '-'; 6017 f[2] = (precision ()) ? 'P' : '-'; 6018 f[3] = (underflow ()) ? 'U' : '-'; 6019 f[4] = (overflow ()) ? 'O' : '-'; 6020 f[5] = (zero_divide ()) ? 'Z' : '-'; 6021 f[6] = (denormalized()) ? 'D' : '-'; 6022 f[7] = (invalid ()) ? 'I' : '-'; 6023 f[8] = '\x0'; 6024 // output 6025 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top()); 6026 } 6027 6028 }; 6029 6030 class TagWord { 6031 public: 6032 int32_t _value; 6033 6034 int tag_at(int i) const { return (_value >> (i*2)) & 3; } 6035 6036 void print() const { 6037 printf("%04x", _value & 0xFFFF); 6038 } 6039 6040 }; 6041 6042 class FPU_Register { 6043 public: 6044 int32_t _m0; 6045 int32_t _m1; 6046 int16_t _ex; 6047 6048 bool is_indefinite() const { 6049 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0; 6050 } 6051 6052 void print() const { 6053 char sign = (_ex < 0) ? '-' : '+'; 6054 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " "; 6055 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind); 6056 }; 6057 6058 }; 6059 6060 class FPU_State { 6061 public: 6062 enum { 6063 register_size = 10, 6064 number_of_registers = 8, 6065 register_mask = 7 6066 }; 6067 6068 ControlWord _control_word; 6069 StatusWord _status_word; 6070 TagWord _tag_word; 6071 int32_t _error_offset; 6072 int32_t _error_selector; 6073 int32_t _data_offset; 6074 int32_t _data_selector; 6075 int8_t _register[register_size * number_of_registers]; 6076 6077 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); } 6078 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; } 6079 6080 const char* tag_as_string(int tag) const { 6081 switch (tag) { 6082 case 0: return "valid"; 6083 case 1: return "zero"; 6084 case 2: return "special"; 6085 case 3: return "empty"; 6086 } 6087 ShouldNotReachHere(); 6088 return nullptr; 6089 } 6090 6091 void print() const { 6092 // print computation registers 6093 { int t = _status_word.top(); 6094 for (int i = 0; i < number_of_registers; i++) { 6095 int j = (i - t) & register_mask; 6096 printf("%c r%d = ST%d = ", (j == 0 ? 
'*' : ' '), i, j); 6097 st(j)->print(); 6098 printf(" %s\n", tag_as_string(_tag_word.tag_at(i))); 6099 } 6100 } 6101 printf("\n"); 6102 // print control registers 6103 printf("ctrl = "); _control_word.print(); printf("\n"); 6104 printf("stat = "); _status_word .print(); printf("\n"); 6105 printf("tags = "); _tag_word .print(); printf("\n"); 6106 } 6107 6108 }; 6109 6110 class Flag_Register { 6111 public: 6112 int32_t _value; 6113 6114 bool overflow() const { return ((_value >> 11) & 1) != 0; } 6115 bool direction() const { return ((_value >> 10) & 1) != 0; } 6116 bool sign() const { return ((_value >> 7) & 1) != 0; } 6117 bool zero() const { return ((_value >> 6) & 1) != 0; } 6118 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; } 6119 bool parity() const { return ((_value >> 2) & 1) != 0; } 6120 bool carry() const { return ((_value >> 0) & 1) != 0; } 6121 6122 void print() const { 6123 // flags 6124 char f[8]; 6125 f[0] = (overflow ()) ? 'O' : '-'; 6126 f[1] = (direction ()) ? 'D' : '-'; 6127 f[2] = (sign ()) ? 'S' : '-'; 6128 f[3] = (zero ()) ? 'Z' : '-'; 6129 f[4] = (auxiliary_carry()) ? 'A' : '-'; 6130 f[5] = (parity ()) ? 'P' : '-'; 6131 f[6] = (carry ()) ? 'C' : '-'; 6132 f[7] = '\x0'; 6133 // output 6134 printf("%08x flags = %s", _value, f); 6135 } 6136 6137 }; 6138 6139 class IU_Register { 6140 public: 6141 int32_t _value; 6142 6143 void print() const { 6144 printf("%08x %11d", _value, _value); 6145 } 6146 6147 }; 6148 6149 class IU_State { 6150 public: 6151 Flag_Register _eflags; 6152 IU_Register _rdi; 6153 IU_Register _rsi; 6154 IU_Register _rbp; 6155 IU_Register _rsp; 6156 IU_Register _rbx; 6157 IU_Register _rdx; 6158 IU_Register _rcx; 6159 IU_Register _rax; 6160 6161 void print() const { 6162 // computation registers 6163 printf("rax, = "); _rax.print(); printf("\n"); 6164 printf("rbx, = "); _rbx.print(); printf("\n"); 6165 printf("rcx = "); _rcx.print(); printf("\n"); 6166 printf("rdx = "); _rdx.print(); printf("\n"); 6167 printf("rdi = "); _rdi.print(); printf("\n"); 6168 printf("rsi = "); _rsi.print(); printf("\n"); 6169 printf("rbp, = "); _rbp.print(); printf("\n"); 6170 printf("rsp = "); _rsp.print(); printf("\n"); 6171 printf("\n"); 6172 // control registers 6173 printf("flgs = "); _eflags.print(); printf("\n"); 6174 } 6175 }; 6176 6177 6178 class CPU_State { 6179 public: 6180 FPU_State _fpu_state; 6181 IU_State _iu_state; 6182 6183 void print() const { 6184 printf("--------------------------------------------------\n"); 6185 _iu_state .print(); 6186 printf("\n"); 6187 _fpu_state.print(); 6188 printf("--------------------------------------------------\n"); 6189 } 6190 6191 }; 6192 6193 6194 static void _print_CPU_state(CPU_State* state) { 6195 state->print(); 6196 }; 6197 6198 6199 void MacroAssembler::print_CPU_state() { 6200 push_CPU_state(); 6201 push(rsp); // pass CPU state 6202 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state))); 6203 addptr(rsp, wordSize); // discard argument 6204 pop_CPU_state(); 6205 } 6206 6207 6208 #ifndef _LP64 6209 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) { 6210 static int counter = 0; 6211 FPU_State* fs = &state->_fpu_state; 6212 counter++; 6213 // For leaf calls, only verify that the top few elements remain empty. 6214 // We only need 1 empty at the top for C2 code. 
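  // stack_depth encodes the expectation: a negative value is used for leaf
  // calls ("at most -stack_depth live elements, just check that the top of
  // the stack stays empty"), a non-negative value means "exactly stack_depth
  // elements".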
6215 if( stack_depth < 0 ) { 6216 if( fs->tag_for_st(7) != 3 ) { 6217 printf("FPR7 not empty\n"); 6218 state->print(); 6219 assert(false, "error"); 6220 return false; 6221 } 6222 return true; // All other stack states do not matter 6223 } 6224 6225 assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(), 6226 "bad FPU control word"); 6227 6228 // compute stack depth 6229 int i = 0; 6230 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++; 6231 int d = i; 6232 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++; 6233 // verify findings 6234 if (i != FPU_State::number_of_registers) { 6235 // stack not contiguous 6236 printf("%s: stack not contiguous at ST%d\n", s, i); 6237 state->print(); 6238 assert(false, "error"); 6239 return false; 6240 } 6241 // check if computed stack depth corresponds to expected stack depth 6242 if (stack_depth < 0) { 6243 // expected stack depth is -stack_depth or less 6244 if (d > -stack_depth) { 6245 // too many elements on the stack 6246 printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d); 6247 state->print(); 6248 assert(false, "error"); 6249 return false; 6250 } 6251 } else { 6252 // expected stack depth is stack_depth 6253 if (d != stack_depth) { 6254 // wrong stack depth 6255 printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d); 6256 state->print(); 6257 assert(false, "error"); 6258 return false; 6259 } 6260 } 6261 // everything is cool 6262 return true; 6263 } 6264 6265 void MacroAssembler::verify_FPU(int stack_depth, const char* s) { 6266 if (!VerifyFPU) return; 6267 push_CPU_state(); 6268 push(rsp); // pass CPU state 6269 ExternalAddress msg((address) s); 6270 // pass message string s 6271 pushptr(msg.addr(), noreg); 6272 push(stack_depth); // pass stack depth 6273 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU))); 6274 addptr(rsp, 3 * wordSize); // discard arguments 6275 // check for error 6276 { Label L; 6277 testl(rax, rax); 6278 jcc(Assembler::notZero, L); 6279 int3(); // break if error condition 6280 bind(L); 6281 } 6282 pop_CPU_state(); 6283 } 6284 #endif // _LP64 6285 6286 void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) { 6287 // Either restore the MXCSR register after returning from the JNI Call 6288 // or verify that it wasn't changed (with -Xcheck:jni flag). 6289 if (VM_Version::supports_sse()) { 6290 if (RestoreMXCSROnJNICalls) { 6291 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch); 6292 } else if (CheckJNICalls) { 6293 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); 6294 } 6295 } 6296 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty. 6297 vzeroupper(); 6298 6299 #ifndef _LP64 6300 // Either restore the x87 floating pointer control word after returning 6301 // from the JNI call or verify that it wasn't changed. 6302 if (CheckJNICalls) { 6303 call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); 6304 } 6305 #endif // _LP64 6306 } 6307 6308 // ((OopHandle)result).resolve(); 6309 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 6310 assert_different_registers(result, tmp); 6311 6312 // Only 64 bit platforms support GCs that require a tmp register 6313 // Only IN_HEAP loads require a thread_tmp register 6314 // OopHandle::resolve is an indirection like jobject. 
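  // In effect: result = *(oop*)result, performed as an IN_NATIVE T_OBJECT
  // load so the GC's barrier-set assembler can interpose if it needs to.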
6315 access_load_at(T_OBJECT, IN_NATIVE, 6316 result, Address(result, 0), tmp, /*tmp_thread*/noreg); 6317 } 6318 6319 // ((WeakHandle)result).resolve(); 6320 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) { 6321 assert_different_registers(rresult, rtmp); 6322 Label resolved; 6323 6324 // A null weak handle resolves to null. 6325 cmpptr(rresult, 0); 6326 jcc(Assembler::equal, resolved); 6327 6328 // Only 64 bit platforms support GCs that require a tmp register 6329 // Only IN_HEAP loads require a thread_tmp register 6330 // WeakHandle::resolve is an indirection like jweak. 6331 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, 6332 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg); 6333 bind(resolved); 6334 } 6335 6336 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) { 6337 // get mirror 6338 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 6339 load_method_holder(mirror, method); 6340 movptr(mirror, Address(mirror, mirror_offset)); 6341 resolve_oop_handle(mirror, tmp); 6342 } 6343 6344 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) { 6345 load_method_holder(rresult, rmethod); 6346 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset())); 6347 } 6348 6349 void MacroAssembler::load_method_holder(Register holder, Register method) { 6350 movptr(holder, Address(method, Method::const_offset())); // ConstMethod* 6351 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool* 6352 movptr(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass* 6353 } 6354 6355 void MacroAssembler::load_metadata(Register dst, Register src) { 6356 #ifdef _LP64 6357 if (UseCompactObjectHeaders) { 6358 load_narrow_klass_compact(dst, src); 6359 } else if (UseCompressedClassPointers) { 6360 movl(dst, Address(src, oopDesc::klass_offset_in_bytes())); 6361 } else 6362 #endif 6363 { 6364 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 6365 } 6366 } 6367 6368 #ifdef _LP64 6369 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) { 6370 assert(UseCompactObjectHeaders, "expect compact object headers"); 6371 movq(dst, Address(src, oopDesc::mark_offset_in_bytes())); 6372 shrq(dst, markWord::klass_shift); 6373 } 6374 #endif 6375 6376 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) { 6377 assert_different_registers(src, tmp); 6378 assert_different_registers(dst, tmp); 6379 #ifdef _LP64 6380 if (UseCompactObjectHeaders) { 6381 load_narrow_klass_compact(dst, src); 6382 decode_klass_not_null(dst, tmp); 6383 } else if (UseCompressedClassPointers) { 6384 movl(dst, Address(src, oopDesc::klass_offset_in_bytes())); 6385 decode_klass_not_null(dst, tmp); 6386 } else 6387 #endif 6388 { 6389 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 6390 } 6391 } 6392 6393 void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) { 6394 load_klass(dst, src, tmp); 6395 movptr(dst, Address(dst, Klass::prototype_header_offset())); 6396 } 6397 6398 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) { 6399 assert(!UseCompactObjectHeaders, "not with compact headers"); 6400 assert_different_registers(src, tmp); 6401 assert_different_registers(dst, tmp); 6402 #ifdef _LP64 6403 if (UseCompressedClassPointers) { 6404 encode_klass_not_null(src, tmp); 6405 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src); 6406 } else 6407 #endif 6408 
movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src); 6409 } 6410 6411 void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) { 6412 #ifdef _LP64 6413 if (UseCompactObjectHeaders) { 6414 assert(tmp != noreg, "need tmp"); 6415 assert_different_registers(klass, obj, tmp); 6416 load_narrow_klass_compact(tmp, obj); 6417 cmpl(klass, tmp); 6418 } else if (UseCompressedClassPointers) { 6419 cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes())); 6420 } else 6421 #endif 6422 { 6423 cmpptr(klass, Address(obj, oopDesc::klass_offset_in_bytes())); 6424 } 6425 } 6426 6427 void MacroAssembler::cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2) { 6428 #ifdef _LP64 6429 if (UseCompactObjectHeaders) { 6430 assert(tmp2 != noreg, "need tmp2"); 6431 assert_different_registers(obj1, obj2, tmp1, tmp2); 6432 load_narrow_klass_compact(tmp1, obj1); 6433 load_narrow_klass_compact(tmp2, obj2); 6434 cmpl(tmp1, tmp2); 6435 } else if (UseCompressedClassPointers) { 6436 movl(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes())); 6437 cmpl(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes())); 6438 } else 6439 #endif 6440 { 6441 movptr(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes())); 6442 cmpptr(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes())); 6443 } 6444 } 6445 6446 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src, 6447 Register tmp1, Register thread_tmp) { 6448 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 6449 decorators = AccessInternal::decorator_fixup(decorators, type); 6450 bool as_raw = (decorators & AS_RAW) != 0; 6451 if (as_raw) { 6452 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 6453 } else { 6454 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 6455 } 6456 } 6457 6458 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val, 6459 Register tmp1, Register tmp2, Register tmp3) { 6460 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 6461 decorators = AccessInternal::decorator_fixup(decorators, type); 6462 bool as_raw = (decorators & AS_RAW) != 0; 6463 if (as_raw) { 6464 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3); 6465 } else { 6466 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3); 6467 } 6468 } 6469 6470 void MacroAssembler::flat_field_copy(DecoratorSet decorators, Register src, Register dst, 6471 Register inline_layout_info) { 6472 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 6473 bs->flat_field_copy(this, decorators, src, dst, inline_layout_info); 6474 } 6475 6476 void MacroAssembler::payload_offset(Register inline_klass, Register offset) { 6477 movptr(offset, Address(inline_klass, InstanceKlass::adr_inlineklass_fixed_block_offset())); 6478 movl(offset, Address(offset, InlineKlass::payload_offset_offset())); 6479 } 6480 6481 void MacroAssembler::payload_addr(Register oop, Register data, Register inline_klass) { 6482 // ((address) (void*) o) + vk->payload_offset(); 6483 Register offset = (data == oop) ? 
rscratch1 : data; 6484 payload_offset(inline_klass, offset); 6485 if (data == oop) { 6486 addptr(data, offset); 6487 } else { 6488 lea(data, Address(oop, offset)); 6489 } 6490 } 6491 6492 void MacroAssembler::data_for_value_array_index(Register array, Register array_klass, 6493 Register index, Register data) { 6494 assert(index != rcx, "index needs to shift by rcx"); 6495 assert_different_registers(array, array_klass, index); 6496 assert_different_registers(rcx, array, index); 6497 6498 // array->base() + (index << Klass::layout_helper_log2_element_size(lh)); 6499 movl(rcx, Address(array_klass, Klass::layout_helper_offset())); 6500 6501 // Klass::layout_helper_log2_element_size(lh) 6502 // (lh >> _lh_log2_element_size_shift) & _lh_log2_element_size_mask; 6503 shrl(rcx, Klass::_lh_log2_element_size_shift); 6504 andl(rcx, Klass::_lh_log2_element_size_mask); 6505 shlptr(index); // index << rcx 6506 6507 lea(data, Address(array, index, Address::times_1, arrayOopDesc::base_offset_in_bytes(T_FLAT_ELEMENT))); 6508 } 6509 6510 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 6511 Register thread_tmp, DecoratorSet decorators) { 6512 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 6513 } 6514 6515 // Doesn't do verification, generates fixed size code 6516 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 6517 Register thread_tmp, DecoratorSet decorators) { 6518 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 6519 } 6520 6521 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1, 6522 Register tmp2, Register tmp3, DecoratorSet decorators) { 6523 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3); 6524 } 6525 6526 // Used for storing nulls. 6527 void MacroAssembler::store_heap_oop_null(Address dst) { 6528 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg); 6529 } 6530 6531 #ifdef _LP64 6532 void MacroAssembler::store_klass_gap(Register dst, Register src) { 6533 assert(!UseCompactObjectHeaders, "Don't use with compact headers"); 6534 if (UseCompressedClassPointers) { 6535 // Store to klass gap in destination 6536 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src); 6537 } 6538 } 6539 6540 #ifdef ASSERT 6541 void MacroAssembler::verify_heapbase(const char* msg) { 6542 assert (UseCompressedOops, "should be compressed"); 6543 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6544 if (CheckCompressedOops) { 6545 Label ok; 6546 ExternalAddress src2(CompressedOops::base_addr()); 6547 const bool is_src2_reachable = reachable(src2); 6548 if (!is_src2_reachable) { 6549 push(rscratch1); // cmpptr trashes rscratch1 6550 } 6551 cmpptr(r12_heapbase, src2, rscratch1); 6552 jcc(Assembler::equal, ok); 6553 STOP(msg); 6554 bind(ok); 6555 if (!is_src2_reachable) { 6556 pop(rscratch1); 6557 } 6558 } 6559 } 6560 #endif 6561 6562 // Algorithm must match oop.inline.hpp encode_heap_oop. 
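// The value computed is
//   narrow = (r == nullptr) ? 0 : (r - heap_base) >> LogMinObjAlignmentInBytes
// with the null case handled branchlessly by the cmovq from r12_heapbase, and
// the subtract/shift dropped entirely when the heap's base or shift is zero.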
6563 void MacroAssembler::encode_heap_oop(Register r) { 6564 #ifdef ASSERT 6565 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 6566 #endif 6567 verify_oop_msg(r, "broken oop in encode_heap_oop"); 6568 if (CompressedOops::base() == nullptr) { 6569 if (CompressedOops::shift() != 0) { 6570 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6571 shrq(r, LogMinObjAlignmentInBytes); 6572 } 6573 return; 6574 } 6575 testq(r, r); 6576 cmovq(Assembler::equal, r, r12_heapbase); 6577 subq(r, r12_heapbase); 6578 shrq(r, LogMinObjAlignmentInBytes); 6579 } 6580 6581 void MacroAssembler::encode_heap_oop_not_null(Register r) { 6582 #ifdef ASSERT 6583 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 6584 if (CheckCompressedOops) { 6585 Label ok; 6586 testq(r, r); 6587 jcc(Assembler::notEqual, ok); 6588 STOP("null oop passed to encode_heap_oop_not_null"); 6589 bind(ok); 6590 } 6591 #endif 6592 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null"); 6593 if (CompressedOops::base() != nullptr) { 6594 subq(r, r12_heapbase); 6595 } 6596 if (CompressedOops::shift() != 0) { 6597 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6598 shrq(r, LogMinObjAlignmentInBytes); 6599 } 6600 } 6601 6602 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 6603 #ifdef ASSERT 6604 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 6605 if (CheckCompressedOops) { 6606 Label ok; 6607 testq(src, src); 6608 jcc(Assembler::notEqual, ok); 6609 STOP("null oop passed to encode_heap_oop_not_null2"); 6610 bind(ok); 6611 } 6612 #endif 6613 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2"); 6614 if (dst != src) { 6615 movq(dst, src); 6616 } 6617 if (CompressedOops::base() != nullptr) { 6618 subq(dst, r12_heapbase); 6619 } 6620 if (CompressedOops::shift() != 0) { 6621 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6622 shrq(dst, LogMinObjAlignmentInBytes); 6623 } 6624 } 6625 6626 void MacroAssembler::decode_heap_oop(Register r) { 6627 #ifdef ASSERT 6628 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 6629 #endif 6630 if (CompressedOops::base() == nullptr) { 6631 if (CompressedOops::shift() != 0) { 6632 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6633 shlq(r, LogMinObjAlignmentInBytes); 6634 } 6635 } else { 6636 Label done; 6637 shlq(r, LogMinObjAlignmentInBytes); 6638 jccb(Assembler::equal, done); 6639 addq(r, r12_heapbase); 6640 bind(done); 6641 } 6642 verify_oop_msg(r, "broken oop in decode_heap_oop"); 6643 } 6644 6645 void MacroAssembler::decode_heap_oop_not_null(Register r) { 6646 // Note: it will change flags 6647 assert (UseCompressedOops, "should only be used for compressed headers"); 6648 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6649 // Cannot assert, unverified entry point counts instructions (see .ad file) 6650 // vtableStubs also counts instructions in pd_code_size_limit. 6651 // Also do not verify_oop as this is called by verify_oop. 
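  // For a known non-null narrow oop the decode is simply
  //   r = (r << LogMinObjAlignmentInBytes) + heap_base
  // (the add is skipped when the base is null, and a zero shift implies a
  // null base, as asserted below); no null check is needed here, unlike
  // decode_heap_oop above.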
6652 if (CompressedOops::shift() != 0) { 6653 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6654 shlq(r, LogMinObjAlignmentInBytes); 6655 if (CompressedOops::base() != nullptr) { 6656 addq(r, r12_heapbase); 6657 } 6658 } else { 6659 assert (CompressedOops::base() == nullptr, "sanity"); 6660 } 6661 } 6662 6663 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 6664 // Note: it will change flags 6665 assert (UseCompressedOops, "should only be used for compressed headers"); 6666 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6667 // Cannot assert, unverified entry point counts instructions (see .ad file) 6668 // vtableStubs also counts instructions in pd_code_size_limit. 6669 // Also do not verify_oop as this is called by verify_oop. 6670 if (CompressedOops::shift() != 0) { 6671 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6672 if (LogMinObjAlignmentInBytes == Address::times_8) { 6673 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0)); 6674 } else { 6675 if (dst != src) { 6676 movq(dst, src); 6677 } 6678 shlq(dst, LogMinObjAlignmentInBytes); 6679 if (CompressedOops::base() != nullptr) { 6680 addq(dst, r12_heapbase); 6681 } 6682 } 6683 } else { 6684 assert (CompressedOops::base() == nullptr, "sanity"); 6685 if (dst != src) { 6686 movq(dst, src); 6687 } 6688 } 6689 } 6690 6691 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) { 6692 assert_different_registers(r, tmp); 6693 if (CompressedKlassPointers::base() != nullptr) { 6694 mov64(tmp, (int64_t)CompressedKlassPointers::base()); 6695 subq(r, tmp); 6696 } 6697 if (CompressedKlassPointers::shift() != 0) { 6698 shrq(r, CompressedKlassPointers::shift()); 6699 } 6700 } 6701 6702 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) { 6703 assert_different_registers(src, dst); 6704 if (CompressedKlassPointers::base() != nullptr) { 6705 mov64(dst, -(int64_t)CompressedKlassPointers::base()); 6706 addq(dst, src); 6707 } else { 6708 movptr(dst, src); 6709 } 6710 if (CompressedKlassPointers::shift() != 0) { 6711 shrq(dst, CompressedKlassPointers::shift()); 6712 } 6713 } 6714 6715 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) { 6716 assert_different_registers(r, tmp); 6717 // Note: it will change flags 6718 assert(UseCompressedClassPointers, "should only be used for compressed headers"); 6719 // Cannot assert, unverified entry point counts instructions (see .ad file) 6720 // vtableStubs also counts instructions in pd_code_size_limit. 6721 // Also do not verify_oop as this is called by verify_oop. 6722 if (CompressedKlassPointers::shift() != 0) { 6723 shlq(r, CompressedKlassPointers::shift()); 6724 } 6725 if (CompressedKlassPointers::base() != nullptr) { 6726 mov64(tmp, (int64_t)CompressedKlassPointers::base()); 6727 addq(r, tmp); 6728 } 6729 } 6730 6731 void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) { 6732 assert_different_registers(src, dst); 6733 // Note: it will change flags 6734 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6735 // Cannot assert, unverified entry point counts instructions (see .ad file) 6736 // vtableStubs also counts instructions in pd_code_size_limit. 6737 // Also do not verify_oop as this is called by verify_oop. 
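  //
  // Conceptually (a sketch, not the emitted code), with
  // base = CompressedKlassPointers::base() and shift = CompressedKlassPointers::shift():
  //
  //   Klass* k = (Klass*)((uint64_t)base + ((uint64_t)narrow_klass << shift));
  //
  // The branches below are strength reductions of that formula:
  //   - no base and no shift: a plain register-to-register move is enough;
  //   - shift <= times_8:     materialize base (or zero) and fold the shift into a lea;
  //   - larger shift:         add the pre-right-shifted base and then shift left,
  //                           which relies on base being aligned to (1 << shift).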
6738 6739 if (CompressedKlassPointers::base() == nullptr && 6740 CompressedKlassPointers::shift() == 0) { 6741 // The best case scenario is that there is no base or shift. Then it is already 6742 // a pointer that needs nothing but a register rename. 6743 movl(dst, src); 6744 } else { 6745 if (CompressedKlassPointers::shift() <= Address::times_8) { 6746 if (CompressedKlassPointers::base() != nullptr) { 6747 mov64(dst, (int64_t)CompressedKlassPointers::base()); 6748 } else { 6749 xorq(dst, dst); 6750 } 6751 if (CompressedKlassPointers::shift() != 0) { 6752 assert(CompressedKlassPointers::shift() == Address::times_8, "klass not aligned on 64bits?"); 6753 leaq(dst, Address(dst, src, Address::times_8, 0)); 6754 } else { 6755 addq(dst, src); 6756 } 6757 } else { 6758 if (CompressedKlassPointers::base() != nullptr) { 6759 const uint64_t base_right_shifted = 6760 (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift(); 6761 mov64(dst, base_right_shifted); 6762 } else { 6763 xorq(dst, dst); 6764 } 6765 addq(dst, src); 6766 shlq(dst, CompressedKlassPointers::shift()); 6767 } 6768 } 6769 } 6770 6771 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 6772 assert (UseCompressedOops, "should only be used for compressed headers"); 6773 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6774 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6775 int oop_index = oop_recorder()->find_index(obj); 6776 RelocationHolder rspec = oop_Relocation::spec(oop_index); 6777 mov_narrow_oop(dst, oop_index, rspec); 6778 } 6779 6780 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) { 6781 assert (UseCompressedOops, "should only be used for compressed headers"); 6782 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6783 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6784 int oop_index = oop_recorder()->find_index(obj); 6785 RelocationHolder rspec = oop_Relocation::spec(oop_index); 6786 mov_narrow_oop(dst, oop_index, rspec); 6787 } 6788 6789 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 6790 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6791 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6792 int klass_index = oop_recorder()->find_index(k); 6793 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 6794 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 6795 } 6796 6797 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) { 6798 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6799 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6800 int klass_index = oop_recorder()->find_index(k); 6801 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 6802 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 6803 } 6804 6805 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) { 6806 assert (UseCompressedOops, "should only be used for compressed headers"); 6807 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6808 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6809 int oop_index = oop_recorder()->find_index(obj); 6810 RelocationHolder rspec = oop_Relocation::spec(oop_index); 6811 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 6812 } 6813 6814 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) { 6815 assert 
(UseCompressedOops, "should only be used for compressed headers"); 6816 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6817 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6818 int oop_index = oop_recorder()->find_index(obj); 6819 RelocationHolder rspec = oop_Relocation::spec(oop_index); 6820 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 6821 } 6822 6823 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) { 6824 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6825 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6826 int klass_index = oop_recorder()->find_index(k); 6827 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 6828 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 6829 } 6830 6831 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) { 6832 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6833 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6834 int klass_index = oop_recorder()->find_index(k); 6835 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 6836 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 6837 } 6838 6839 void MacroAssembler::reinit_heapbase() { 6840 if (UseCompressedOops) { 6841 if (Universe::heap() != nullptr) { 6842 if (CompressedOops::base() == nullptr) { 6843 MacroAssembler::xorptr(r12_heapbase, r12_heapbase); 6844 } else { 6845 mov64(r12_heapbase, (int64_t)CompressedOops::base()); 6846 } 6847 } else { 6848 movptr(r12_heapbase, ExternalAddress(CompressedOops::base_addr())); 6849 } 6850 } 6851 } 6852 6853 #endif // _LP64 6854 6855 #if COMPILER2_OR_JVMCI 6856 6857 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers 6858 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, KRegister mask) { 6859 // cnt - number of qwords (8-byte words). 6860 // base - start address, qword aligned. 
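  // In effect this emits a vectorized version of the loop below (a sketch of the
  // intent; the real code unrolls to 64 bytes per iteration and uses masked
  // stores for the tail where AVX-512 is available):
  //
  //   for (size_t i = 0; i < cnt; i++) {
  //     ((int64_t*)base)[i] = val;
  //   }
  //
  // val is broadcast once into xtmp (128, 256 or 512 bits wide depending on
  // MaxVectorSize and the AVX3 threshold); the leftover 0..7 qwords are written
  // with a masked store or with 32-/8-byte stores.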
6861 Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end; 6862 bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0); 6863 if (use64byteVector) { 6864 evpbroadcastq(xtmp, val, AVX_512bit); 6865 } else if (MaxVectorSize >= 32) { 6866 movdq(xtmp, val); 6867 punpcklqdq(xtmp, xtmp); 6868 vinserti128_high(xtmp, xtmp); 6869 } else { 6870 movdq(xtmp, val); 6871 punpcklqdq(xtmp, xtmp); 6872 } 6873 jmp(L_zero_64_bytes); 6874 6875 BIND(L_loop); 6876 if (MaxVectorSize >= 32) { 6877 fill64(base, 0, xtmp, use64byteVector); 6878 } else { 6879 movdqu(Address(base, 0), xtmp); 6880 movdqu(Address(base, 16), xtmp); 6881 movdqu(Address(base, 32), xtmp); 6882 movdqu(Address(base, 48), xtmp); 6883 } 6884 addptr(base, 64); 6885 6886 BIND(L_zero_64_bytes); 6887 subptr(cnt, 8); 6888 jccb(Assembler::greaterEqual, L_loop); 6889 6890 // Copy trailing 64 bytes 6891 if (use64byteVector) { 6892 addptr(cnt, 8); 6893 jccb(Assembler::equal, L_end); 6894 fill64_masked(3, base, 0, xtmp, mask, cnt, val, true); 6895 jmp(L_end); 6896 } else { 6897 addptr(cnt, 4); 6898 jccb(Assembler::less, L_tail); 6899 if (MaxVectorSize >= 32) { 6900 vmovdqu(Address(base, 0), xtmp); 6901 } else { 6902 movdqu(Address(base, 0), xtmp); 6903 movdqu(Address(base, 16), xtmp); 6904 } 6905 } 6906 addptr(base, 32); 6907 subptr(cnt, 4); 6908 6909 BIND(L_tail); 6910 addptr(cnt, 4); 6911 jccb(Assembler::lessEqual, L_end); 6912 if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) { 6913 fill32_masked(3, base, 0, xtmp, mask, cnt, val); 6914 } else { 6915 decrement(cnt); 6916 6917 BIND(L_sloop); 6918 movq(Address(base, 0), xtmp); 6919 addptr(base, 8); 6920 decrement(cnt); 6921 jccb(Assembler::greaterEqual, L_sloop); 6922 } 6923 BIND(L_end); 6924 } 6925 6926 int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) { 6927 assert(InlineTypeReturnedAsFields, "Inline types should never be returned as fields"); 6928 // An inline type might be returned. If fields are in registers we 6929 // need to allocate an inline type instance and initialize it with 6930 // the value of the fields. 6931 Label skip; 6932 // We only need a new buffered inline type if a new one is not returned 6933 testptr(rax, 1); 6934 jcc(Assembler::zero, skip); 6935 int call_offset = -1; 6936 6937 #ifdef _LP64 6938 // The following code is similar to allocate_instance but has some slight differences, 6939 // e.g. object size is always not zero, sometimes it's constant; storing klass ptr after 6940 // allocating is not necessary if vk != nullptr, etc. allocate_instance is not aware of these. 6941 Label slow_case; 6942 // 1. Try to allocate a new buffered inline instance either from TLAB or eden space 6943 mov(rscratch1, rax); // save rax for slow_case since *_allocate may corrupt it when allocation failed 6944 if (vk != nullptr) { 6945 // Called from C1, where the return type is statically known. 6946 movptr(rbx, (intptr_t)vk->get_InlineKlass()); 6947 jint lh = vk->layout_helper(); 6948 assert(lh != Klass::_lh_neutral_value, "inline class in return type must have been resolved"); 6949 if (UseTLAB && !Klass::layout_helper_needs_slow_path(lh)) { 6950 tlab_allocate(r15_thread, rax, noreg, lh, r13, r14, slow_case); 6951 } else { 6952 jmp(slow_case); 6953 } 6954 } else { 6955 // Call from interpreter. 
RAX contains ((the InlineKlass* of the return type) | 0x01) 6956 mov(rbx, rax); 6957 andptr(rbx, -2); 6958 if (UseTLAB) { 6959 movl(r14, Address(rbx, Klass::layout_helper_offset())); 6960 testl(r14, Klass::_lh_instance_slow_path_bit); 6961 jcc(Assembler::notZero, slow_case); 6962 tlab_allocate(r15_thread, rax, r14, 0, r13, r14, slow_case); 6963 } else { 6964 jmp(slow_case); 6965 } 6966 } 6967 if (UseTLAB) { 6968 // 2. Initialize buffered inline instance header 6969 Register buffer_obj = rax; 6970 if (UseCompactObjectHeaders) { 6971 Register mark_word = r13; 6972 movptr(mark_word, Address(rbx, Klass::prototype_header_offset())); 6973 movptr(Address(buffer_obj, oopDesc::mark_offset_in_bytes ()), mark_word); 6974 } else { 6975 movptr(Address(buffer_obj, oopDesc::mark_offset_in_bytes()), (intptr_t)markWord::inline_type_prototype().value()); 6976 xorl(r13, r13); 6977 store_klass_gap(buffer_obj, r13); 6978 if (vk == nullptr) { 6979 // store_klass corrupts rbx(klass), so save it in r13 for later use (interpreter case only). 6980 mov(r13, rbx); 6981 } 6982 store_klass(buffer_obj, rbx, rscratch1); 6983 } 6984 // 3. Initialize its fields with an inline class specific handler 6985 if (vk != nullptr) { 6986 call(RuntimeAddress(vk->pack_handler())); // no need for call info as this will not safepoint. 6987 } else { 6988 movptr(rbx, Address(r13, InstanceKlass::adr_inlineklass_fixed_block_offset())); 6989 movptr(rbx, Address(rbx, InlineKlass::pack_handler_offset())); 6990 call(rbx); 6991 } 6992 jmp(skip); 6993 } 6994 bind(slow_case); 6995 // We failed to allocate a new inline type, fall back to a runtime 6996 // call. Some oop field may be live in some registers but we can't 6997 // tell. That runtime call will take care of preserving them 6998 // across a GC if there's one. 
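  // rax was clobbered by the allocation attempt above, so restore the original
  // (tagged) return value that was stashed in rscratch1 before calling into the
  // runtime.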
6999 mov(rax, rscratch1); 7000 #endif 7001 7002 if (from_interpreter) { 7003 super_call_VM_leaf(StubRoutines::store_inline_type_fields_to_buf()); 7004 } else { 7005 call(RuntimeAddress(StubRoutines::store_inline_type_fields_to_buf())); 7006 call_offset = offset(); 7007 } 7008 7009 bind(skip); 7010 return call_offset; 7011 } 7012 7013 // Move a value between registers/stack slots and update the reg_state 7014 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) { 7015 assert(from->is_valid() && to->is_valid(), "source and destination must be valid"); 7016 if (reg_state[to->value()] == reg_written) { 7017 return true; // Already written 7018 } 7019 if (from != to && bt != T_VOID) { 7020 if (reg_state[to->value()] == reg_readonly) { 7021 return false; // Not yet writable 7022 } 7023 if (from->is_reg()) { 7024 if (to->is_reg()) { 7025 if (from->is_XMMRegister()) { 7026 if (bt == T_DOUBLE) { 7027 movdbl(to->as_XMMRegister(), from->as_XMMRegister()); 7028 } else { 7029 assert(bt == T_FLOAT, "must be float"); 7030 movflt(to->as_XMMRegister(), from->as_XMMRegister()); 7031 } 7032 } else { 7033 movq(to->as_Register(), from->as_Register()); 7034 } 7035 } else { 7036 int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 7037 Address to_addr = Address(rsp, st_off); 7038 if (from->is_XMMRegister()) { 7039 if (bt == T_DOUBLE) { 7040 movdbl(to_addr, from->as_XMMRegister()); 7041 } else { 7042 assert(bt == T_FLOAT, "must be float"); 7043 movflt(to_addr, from->as_XMMRegister()); 7044 } 7045 } else { 7046 movq(to_addr, from->as_Register()); 7047 } 7048 } 7049 } else { 7050 Address from_addr = Address(rsp, from->reg2stack() * VMRegImpl::stack_slot_size + wordSize); 7051 if (to->is_reg()) { 7052 if (to->is_XMMRegister()) { 7053 if (bt == T_DOUBLE) { 7054 movdbl(to->as_XMMRegister(), from_addr); 7055 } else { 7056 assert(bt == T_FLOAT, "must be float"); 7057 movflt(to->as_XMMRegister(), from_addr); 7058 } 7059 } else { 7060 movq(to->as_Register(), from_addr); 7061 } 7062 } else { 7063 int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 7064 movq(r13, from_addr); 7065 movq(Address(rsp, st_off), r13); 7066 } 7067 } 7068 } 7069 // Update register states 7070 reg_state[from->value()] = reg_writable; 7071 reg_state[to->value()] = reg_written; 7072 return true; 7073 } 7074 7075 // Calculate the extra stack space required for packing or unpacking inline 7076 // args and adjust the stack pointer 7077 int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) { 7078 // Two additional slots to account for return address 7079 int sp_inc = (args_on_stack + 2) * VMRegImpl::stack_slot_size; 7080 sp_inc = align_up(sp_inc, StackAlignmentInBytes); 7081 // Save the return address, adjust the stack (make sure it is properly 7082 // 16-byte aligned) and copy the return address to the new top of the stack. 7083 // The stack will be repaired on return (see MacroAssembler::remove_frame). 7084 assert(sp_inc > 0, "sanity"); 7085 pop(r13); 7086 subptr(rsp, sp_inc); 7087 push(r13); 7088 return sp_inc; 7089 } 7090 7091 // Read all fields from an inline type buffer and store the field values in registers/stack slots. 
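// Roughly (a sketch of the intent, not of the register bookkeeping below): for
// each scalarized field slot of the argument described by 'sig',
//
//   if (field.offset == -1)  slot = 1;                          // IsInit marker
//   else                     slot = buffer->field_at(field.offset);
//
// and, if the argument is nullable and the buffer is null, IsInit is set to 0
// and all oop slots are zeroed instead so the GC never sees a stale reference.
// Destination slots that are still read-only (their current value has not been
// consumed yet) are skipped; the return value tells the caller whether every
// slot was written in this pass.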
7092 bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, 7093 VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index, 7094 RegState reg_state[]) { 7095 assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter"); 7096 assert(from->is_valid(), "source must be valid"); 7097 bool progress = false; 7098 #ifdef ASSERT 7099 const int start_offset = offset(); 7100 #endif 7101 7102 Label L_null, L_notNull; 7103 // Don't use r14 as tmp because it's used for spilling (see MacroAssembler::spill_reg_for) 7104 Register tmp1 = r10; 7105 Register tmp2 = r13; 7106 Register fromReg = noreg; 7107 ScalarizedInlineArgsStream stream(sig, sig_index, to, to_count, to_index, -1); 7108 bool done = true; 7109 bool mark_done = true; 7110 VMReg toReg; 7111 BasicType bt; 7112 // Check if argument requires a null check 7113 bool null_check = false; 7114 VMReg nullCheckReg; 7115 while (stream.next(nullCheckReg, bt)) { 7116 if (sig->at(stream.sig_index())._offset == -1) { 7117 null_check = true; 7118 break; 7119 } 7120 } 7121 stream.reset(sig_index, to_index); 7122 while (stream.next(toReg, bt)) { 7123 assert(toReg->is_valid(), "destination must be valid"); 7124 int idx = (int)toReg->value(); 7125 if (reg_state[idx] == reg_readonly) { 7126 if (idx != from->value()) { 7127 mark_done = false; 7128 } 7129 done = false; 7130 continue; 7131 } else if (reg_state[idx] == reg_written) { 7132 continue; 7133 } 7134 assert(reg_state[idx] == reg_writable, "must be writable"); 7135 reg_state[idx] = reg_written; 7136 progress = true; 7137 7138 if (fromReg == noreg) { 7139 if (from->is_reg()) { 7140 fromReg = from->as_Register(); 7141 } else { 7142 int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 7143 movq(tmp1, Address(rsp, st_off)); 7144 fromReg = tmp1; 7145 } 7146 if (null_check) { 7147 // Nullable inline type argument, emit null check 7148 testptr(fromReg, fromReg); 7149 jcc(Assembler::zero, L_null); 7150 } 7151 } 7152 int off = sig->at(stream.sig_index())._offset; 7153 if (off == -1) { 7154 assert(null_check, "Missing null check at"); 7155 if (toReg->is_stack()) { 7156 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 7157 movq(Address(rsp, st_off), 1); 7158 } else { 7159 movq(toReg->as_Register(), 1); 7160 } 7161 continue; 7162 } 7163 assert(off > 0, "offset in object should be positive"); 7164 Address fromAddr = Address(fromReg, off); 7165 if (!toReg->is_XMMRegister()) { 7166 Register dst = toReg->is_stack() ? tmp2 : toReg->as_Register(); 7167 if (is_reference_type(bt)) { 7168 load_heap_oop(dst, fromAddr); 7169 } else { 7170 bool is_signed = (bt != T_CHAR) && (bt != T_BOOLEAN); 7171 load_sized_value(dst, fromAddr, type2aelembytes(bt), is_signed); 7172 } 7173 if (toReg->is_stack()) { 7174 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 7175 movq(Address(rsp, st_off), dst); 7176 } 7177 } else if (bt == T_DOUBLE) { 7178 movdbl(toReg->as_XMMRegister(), fromAddr); 7179 } else { 7180 assert(bt == T_FLOAT, "must be float"); 7181 movflt(toReg->as_XMMRegister(), fromAddr); 7182 } 7183 } 7184 if (progress && null_check) { 7185 if (done) { 7186 jmp(L_notNull); 7187 bind(L_null); 7188 // Set IsInit field to zero to signal that the argument is null. 7189 // Also set all oop fields to zero to make the GC happy. 
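      // Primitive field slots are deliberately left untouched on this path:
      // with IsInit == 0 the callee treats the whole argument as null, so only
      // the oop slots (and the IsInit slot itself) need to be zeroed.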
7190 stream.reset(sig_index, to_index); 7191 while (stream.next(toReg, bt)) { 7192 if (sig->at(stream.sig_index())._offset == -1 || 7193 bt == T_OBJECT || bt == T_ARRAY) { 7194 if (toReg->is_stack()) { 7195 int st_off = toReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 7196 movq(Address(rsp, st_off), 0); 7197 } else { 7198 xorq(toReg->as_Register(), toReg->as_Register()); 7199 } 7200 } 7201 } 7202 bind(L_notNull); 7203 } else { 7204 bind(L_null); 7205 } 7206 } 7207 7208 sig_index = stream.sig_index(); 7209 to_index = stream.regs_index(); 7210 7211 if (mark_done && reg_state[from->value()] != reg_written) { 7212 // This is okay because no one else will write to that slot 7213 reg_state[from->value()] = reg_writable; 7214 } 7215 from_index--; 7216 assert(progress || (start_offset == offset()), "should not emit code"); 7217 return done; 7218 } 7219 7220 bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index, 7221 VMRegPair* from, int from_count, int& from_index, VMReg to, 7222 RegState reg_state[], Register val_array) { 7223 assert(sig->at(sig_index)._bt == T_METADATA, "should be at delimiter"); 7224 assert(to->is_valid(), "destination must be valid"); 7225 7226 if (reg_state[to->value()] == reg_written) { 7227 skip_unpacked_fields(sig, sig_index, from, from_count, from_index); 7228 return true; // Already written 7229 } 7230 7231 // TODO 8284443 Isn't it an issue if below code uses r14 as tmp when it contains a spilled value? 7232 // Be careful with r14 because it's used for spilling (see MacroAssembler::spill_reg_for). 7233 Register val_obj_tmp = r11; 7234 Register from_reg_tmp = r14; 7235 Register tmp1 = r10; 7236 Register tmp2 = r13; 7237 Register tmp3 = rbx; 7238 Register val_obj = to->is_stack() ? val_obj_tmp : to->as_Register(); 7239 7240 assert_different_registers(val_obj_tmp, from_reg_tmp, tmp1, tmp2, tmp3, val_array); 7241 7242 if (reg_state[to->value()] == reg_readonly) { 7243 if (!is_reg_in_unpacked_fields(sig, sig_index, to, from, from_count, from_index)) { 7244 skip_unpacked_fields(sig, sig_index, from, from_count, from_index); 7245 return false; // Not yet writable 7246 } 7247 val_obj = val_obj_tmp; 7248 } 7249 7250 int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + vtarg_index * type2aelembytes(T_OBJECT); 7251 load_heap_oop(val_obj, Address(val_array, index)); 7252 7253 ScalarizedInlineArgsStream stream(sig, sig_index, from, from_count, from_index); 7254 VMReg fromReg; 7255 BasicType bt; 7256 Label L_null; 7257 while (stream.next(fromReg, bt)) { 7258 assert(fromReg->is_valid(), "source must be valid"); 7259 reg_state[fromReg->value()] = reg_writable; 7260 7261 int off = sig->at(stream.sig_index())._offset; 7262 if (off == -1) { 7263 // Nullable inline type argument, emit null check 7264 Label L_notNull; 7265 if (fromReg->is_stack()) { 7266 int ld_off = fromReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 7267 testb(Address(rsp, ld_off), 1); 7268 } else { 7269 testb(fromReg->as_Register(), 1); 7270 } 7271 jcc(Assembler::notZero, L_notNull); 7272 movptr(val_obj, 0); 7273 jmp(L_null); 7274 bind(L_notNull); 7275 continue; 7276 } 7277 7278 assert(off > 0, "offset in object should be positive"); 7279 size_t size_in_bytes = is_java_primitive(bt) ? 
type2aelembytes(bt) : wordSize; 7280 7281 Address dst(val_obj, off); 7282 if (!fromReg->is_XMMRegister()) { 7283 Register src; 7284 if (fromReg->is_stack()) { 7285 src = from_reg_tmp; 7286 int ld_off = fromReg->reg2stack() * VMRegImpl::stack_slot_size + wordSize; 7287 load_sized_value(src, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false); 7288 } else { 7289 src = fromReg->as_Register(); 7290 } 7291 assert_different_registers(dst.base(), src, tmp1, tmp2, tmp3, val_array); 7292 if (is_reference_type(bt)) { 7293 store_heap_oop(dst, src, tmp1, tmp2, tmp3, IN_HEAP | ACCESS_WRITE | IS_DEST_UNINITIALIZED); 7294 } else { 7295 store_sized_value(dst, src, size_in_bytes); 7296 } 7297 } else if (bt == T_DOUBLE) { 7298 movdbl(dst, fromReg->as_XMMRegister()); 7299 } else { 7300 assert(bt == T_FLOAT, "must be float"); 7301 movflt(dst, fromReg->as_XMMRegister()); 7302 } 7303 } 7304 bind(L_null); 7305 sig_index = stream.sig_index(); 7306 from_index = stream.regs_index(); 7307 7308 assert(reg_state[to->value()] == reg_writable, "must have already been read"); 7309 bool success = move_helper(val_obj->as_VMReg(), to, T_OBJECT, reg_state); 7310 assert(success, "to register must be writeable"); 7311 return true; 7312 } 7313 7314 VMReg MacroAssembler::spill_reg_for(VMReg reg) { 7315 return reg->is_XMMRegister() ? xmm8->as_VMReg() : r14->as_VMReg(); 7316 } 7317 7318 void MacroAssembler::remove_frame(int initial_framesize, bool needs_stack_repair) { 7319 assert((initial_framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 7320 if (needs_stack_repair) { 7321 movq(rbp, Address(rsp, initial_framesize)); 7322 // The stack increment resides just below the saved rbp 7323 addq(rsp, Address(rsp, initial_framesize - wordSize)); 7324 } else { 7325 if (initial_framesize > 0) { 7326 addq(rsp, initial_framesize); 7327 } 7328 pop(rbp); 7329 } 7330 } 7331 7332 // Clearing constant sized memory using YMM/ZMM registers. 7333 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) { 7334 assert(UseAVX > 2 && VM_Version::supports_avx512vl(), ""); 7335 bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0); 7336 7337 int vector64_count = (cnt & (~0x7)) >> 3; 7338 cnt = cnt & 0x7; 7339 const int fill64_per_loop = 4; 7340 const int max_unrolled_fill64 = 8; 7341 7342 // 64 byte initialization loop. 7343 vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit); 7344 int start64 = 0; 7345 if (vector64_count > max_unrolled_fill64) { 7346 Label LOOP; 7347 Register index = rtmp; 7348 7349 start64 = vector64_count - (vector64_count % fill64_per_loop); 7350 7351 movl(index, 0); 7352 BIND(LOOP); 7353 for (int i = 0; i < fill64_per_loop; i++) { 7354 fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector); 7355 } 7356 addl(index, fill64_per_loop * 64); 7357 cmpl(index, start64 * 64); 7358 jccb(Assembler::less, LOOP); 7359 } 7360 for (int i = start64; i < vector64_count; i++) { 7361 fill64(base, i * 64, xtmp, use64byteVector); 7362 } 7363 7364 // Clear remaining 64 byte tail. 
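  // At this point cnt holds the 0..7 qwords that did not fit into the 64-byte
  // stores above; each case below writes exactly cnt * 8 bytes at base + disp.
  // For example, cnt == 3 uses the k-mask 0x7 to store the low three qwords of
  // a 256-bit masked store, and cnt == 5 without 64-byte vectors issues a full
  // 256-bit store followed by a single 8-byte store.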
7365 int disp = vector64_count * 64; 7366 if (cnt) { 7367 switch (cnt) { 7368 case 1: 7369 movq(Address(base, disp), xtmp); 7370 break; 7371 case 2: 7372 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit); 7373 break; 7374 case 3: 7375 movl(rtmp, 0x7); 7376 kmovwl(mask, rtmp); 7377 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit); 7378 break; 7379 case 4: 7380 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 7381 break; 7382 case 5: 7383 if (use64byteVector) { 7384 movl(rtmp, 0x1F); 7385 kmovwl(mask, rtmp); 7386 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit); 7387 } else { 7388 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 7389 movq(Address(base, disp + 32), xtmp); 7390 } 7391 break; 7392 case 6: 7393 if (use64byteVector) { 7394 movl(rtmp, 0x3F); 7395 kmovwl(mask, rtmp); 7396 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit); 7397 } else { 7398 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 7399 evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit); 7400 } 7401 break; 7402 case 7: 7403 if (use64byteVector) { 7404 movl(rtmp, 0x7F); 7405 kmovwl(mask, rtmp); 7406 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit); 7407 } else { 7408 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 7409 movl(rtmp, 0x7); 7410 kmovwl(mask, rtmp); 7411 evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit); 7412 } 7413 break; 7414 default: 7415 fatal("Unexpected length : %d\n",cnt); 7416 break; 7417 } 7418 } 7419 } 7420 7421 void MacroAssembler::clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, 7422 bool is_large, bool word_copy_only, KRegister mask) { 7423 // cnt - number of qwords (8-byte words). 7424 // base - start address, qword aligned. 
7425 // is_large - if optimizers know cnt is larger than InitArrayShortSize 7426 assert(base==rdi, "base register must be edi for rep stos"); 7427 assert(val==rax, "val register must be eax for rep stos"); 7428 assert(cnt==rcx, "cnt register must be ecx for rep stos"); 7429 assert(InitArrayShortSize % BytesPerLong == 0, 7430 "InitArrayShortSize should be the multiple of BytesPerLong"); 7431 7432 Label DONE; 7433 7434 if (!is_large) { 7435 Label LOOP, LONG; 7436 cmpptr(cnt, InitArrayShortSize/BytesPerLong); 7437 jccb(Assembler::greater, LONG); 7438 7439 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM 7440 7441 decrement(cnt); 7442 jccb(Assembler::negative, DONE); // Zero length 7443 7444 // Use individual pointer-sized stores for small counts: 7445 BIND(LOOP); 7446 movptr(Address(base, cnt, Address::times_ptr), val); 7447 decrement(cnt); 7448 jccb(Assembler::greaterEqual, LOOP); 7449 jmpb(DONE); 7450 7451 BIND(LONG); 7452 } 7453 7454 // Use longer rep-prefixed ops for non-small counts: 7455 if (UseFastStosb && !word_copy_only) { 7456 shlptr(cnt, 3); // convert to number of bytes 7457 rep_stosb(); 7458 } else if (UseXMMForObjInit) { 7459 xmm_clear_mem(base, cnt, val, xtmp, mask); 7460 } else { 7461 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM 7462 rep_stos(); 7463 } 7464 7465 BIND(DONE); 7466 } 7467 7468 #endif //COMPILER2_OR_JVMCI 7469 7470 7471 void MacroAssembler::generate_fill(BasicType t, bool aligned, 7472 Register to, Register value, Register count, 7473 Register rtmp, XMMRegister xtmp) { 7474 ShortBranchVerifier sbv(this); 7475 assert_different_registers(to, value, count, rtmp); 7476 Label L_exit; 7477 Label L_fill_2_bytes, L_fill_4_bytes; 7478 7479 #if defined(COMPILER2) && defined(_LP64) 7480 if(MaxVectorSize >=32 && 7481 VM_Version::supports_avx512vlbw() && 7482 VM_Version::supports_bmi2()) { 7483 generate_fill_avx3(t, to, value, count, rtmp, xtmp); 7484 return; 7485 } 7486 #endif 7487 7488 int shift = -1; 7489 switch (t) { 7490 case T_BYTE: 7491 shift = 2; 7492 break; 7493 case T_SHORT: 7494 shift = 1; 7495 break; 7496 case T_INT: 7497 shift = 0; 7498 break; 7499 default: ShouldNotReachHere(); 7500 } 7501 7502 if (t == T_BYTE) { 7503 andl(value, 0xff); 7504 movl(rtmp, value); 7505 shll(rtmp, 8); 7506 orl(value, rtmp); 7507 } 7508 if (t == T_SHORT) { 7509 andl(value, 0xffff); 7510 } 7511 if (t == T_BYTE || t == T_SHORT) { 7512 movl(rtmp, value); 7513 shll(rtmp, 16); 7514 orl(value, rtmp); 7515 } 7516 7517 cmpptr(count, 2<<shift); // Short arrays (< 8 bytes) fill by element 7518 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp 7519 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) { 7520 Label L_skip_align2; 7521 // align source address at 4 bytes address boundary 7522 if (t == T_BYTE) { 7523 Label L_skip_align1; 7524 // One byte misalignment happens only for byte arrays 7525 testptr(to, 1); 7526 jccb(Assembler::zero, L_skip_align1); 7527 movb(Address(to, 0), value); 7528 increment(to); 7529 decrement(count); 7530 BIND(L_skip_align1); 7531 } 7532 // Two bytes misalignment happens only for byte and short (char) arrays 7533 testptr(to, 2); 7534 jccb(Assembler::zero, L_skip_align2); 7535 movw(Address(to, 0), value); 7536 addptr(to, 2); 7537 subptr(count, 1<<(shift-1)); 7538 BIND(L_skip_align2); 7539 } 7540 if (UseSSE < 2) { 7541 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 7542 // Fill 32-byte chunks 7543 subptr(count, 8 << shift); 7544 
jcc(Assembler::less, L_check_fill_8_bytes); 7545 align(16); 7546 7547 BIND(L_fill_32_bytes_loop); 7548 7549 for (int i = 0; i < 32; i += 4) { 7550 movl(Address(to, i), value); 7551 } 7552 7553 addptr(to, 32); 7554 subptr(count, 8 << shift); 7555 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 7556 BIND(L_check_fill_8_bytes); 7557 addptr(count, 8 << shift); 7558 jccb(Assembler::zero, L_exit); 7559 jmpb(L_fill_8_bytes); 7560 7561 // 7562 // length is too short, just fill qwords 7563 // 7564 BIND(L_fill_8_bytes_loop); 7565 movl(Address(to, 0), value); 7566 movl(Address(to, 4), value); 7567 addptr(to, 8); 7568 BIND(L_fill_8_bytes); 7569 subptr(count, 1 << (shift + 1)); 7570 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 7571 // fall through to fill 4 bytes 7572 } else { 7573 Label L_fill_32_bytes; 7574 if (!UseUnalignedLoadStores) { 7575 // align to 8 bytes, we know we are 4 byte aligned to start 7576 testptr(to, 4); 7577 jccb(Assembler::zero, L_fill_32_bytes); 7578 movl(Address(to, 0), value); 7579 addptr(to, 4); 7580 subptr(count, 1<<shift); 7581 } 7582 BIND(L_fill_32_bytes); 7583 { 7584 assert( UseSSE >= 2, "supported cpu only" ); 7585 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 7586 movdl(xtmp, value); 7587 if (UseAVX >= 2 && UseUnalignedLoadStores) { 7588 Label L_check_fill_32_bytes; 7589 if (UseAVX > 2) { 7590 // Fill 64-byte chunks 7591 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2; 7592 7593 // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2 7594 cmpptr(count, VM_Version::avx3_threshold()); 7595 jccb(Assembler::below, L_check_fill_64_bytes_avx2); 7596 7597 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); 7598 7599 subptr(count, 16 << shift); 7600 jccb(Assembler::less, L_check_fill_32_bytes); 7601 align(16); 7602 7603 BIND(L_fill_64_bytes_loop_avx3); 7604 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit); 7605 addptr(to, 64); 7606 subptr(count, 16 << shift); 7607 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3); 7608 jmpb(L_check_fill_32_bytes); 7609 7610 BIND(L_check_fill_64_bytes_avx2); 7611 } 7612 // Fill 64-byte chunks 7613 Label L_fill_64_bytes_loop; 7614 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit); 7615 7616 subptr(count, 16 << shift); 7617 jcc(Assembler::less, L_check_fill_32_bytes); 7618 align(16); 7619 7620 BIND(L_fill_64_bytes_loop); 7621 vmovdqu(Address(to, 0), xtmp); 7622 vmovdqu(Address(to, 32), xtmp); 7623 addptr(to, 64); 7624 subptr(count, 16 << shift); 7625 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); 7626 7627 BIND(L_check_fill_32_bytes); 7628 addptr(count, 8 << shift); 7629 jccb(Assembler::less, L_check_fill_8_bytes); 7630 vmovdqu(Address(to, 0), xtmp); 7631 addptr(to, 32); 7632 subptr(count, 8 << shift); 7633 7634 BIND(L_check_fill_8_bytes); 7635 // clean upper bits of YMM registers 7636 movdl(xtmp, value); 7637 pshufd(xtmp, xtmp, 0); 7638 } else { 7639 // Fill 32-byte chunks 7640 pshufd(xtmp, xtmp, 0); 7641 7642 subptr(count, 8 << shift); 7643 jcc(Assembler::less, L_check_fill_8_bytes); 7644 align(16); 7645 7646 BIND(L_fill_32_bytes_loop); 7647 7648 if (UseUnalignedLoadStores) { 7649 movdqu(Address(to, 0), xtmp); 7650 movdqu(Address(to, 16), xtmp); 7651 } else { 7652 movq(Address(to, 0), xtmp); 7653 movq(Address(to, 8), xtmp); 7654 movq(Address(to, 16), xtmp); 7655 movq(Address(to, 24), xtmp); 7656 } 7657 7658 addptr(to, 32); 7659 subptr(count, 8 << shift); 7660 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 7661 7662 
BIND(L_check_fill_8_bytes); 7663 } 7664 addptr(count, 8 << shift); 7665 jccb(Assembler::zero, L_exit); 7666 jmpb(L_fill_8_bytes); 7667 7668 // 7669 // length is too short, just fill qwords 7670 // 7671 BIND(L_fill_8_bytes_loop); 7672 movq(Address(to, 0), xtmp); 7673 addptr(to, 8); 7674 BIND(L_fill_8_bytes); 7675 subptr(count, 1 << (shift + 1)); 7676 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 7677 } 7678 } 7679 // fill trailing 4 bytes 7680 BIND(L_fill_4_bytes); 7681 testl(count, 1<<shift); 7682 jccb(Assembler::zero, L_fill_2_bytes); 7683 movl(Address(to, 0), value); 7684 if (t == T_BYTE || t == T_SHORT) { 7685 Label L_fill_byte; 7686 addptr(to, 4); 7687 BIND(L_fill_2_bytes); 7688 // fill trailing 2 bytes 7689 testl(count, 1<<(shift-1)); 7690 jccb(Assembler::zero, L_fill_byte); 7691 movw(Address(to, 0), value); 7692 if (t == T_BYTE) { 7693 addptr(to, 2); 7694 BIND(L_fill_byte); 7695 // fill trailing byte 7696 testl(count, 1); 7697 jccb(Assembler::zero, L_exit); 7698 movb(Address(to, 0), value); 7699 } else { 7700 BIND(L_fill_byte); 7701 } 7702 } else { 7703 BIND(L_fill_2_bytes); 7704 } 7705 BIND(L_exit); 7706 } 7707 7708 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) { 7709 switch(type) { 7710 case T_BYTE: 7711 case T_BOOLEAN: 7712 evpbroadcastb(dst, src, vector_len); 7713 break; 7714 case T_SHORT: 7715 case T_CHAR: 7716 evpbroadcastw(dst, src, vector_len); 7717 break; 7718 case T_INT: 7719 case T_FLOAT: 7720 evpbroadcastd(dst, src, vector_len); 7721 break; 7722 case T_LONG: 7723 case T_DOUBLE: 7724 evpbroadcastq(dst, src, vector_len); 7725 break; 7726 default: 7727 fatal("Unhandled type : %s", type2name(type)); 7728 break; 7729 } 7730 } 7731 7732 // encode char[] to byte[] in ISO_8859_1 or ASCII 7733 //@IntrinsicCandidate 7734 //private static int implEncodeISOArray(byte[] sa, int sp, 7735 //byte[] da, int dp, int len) { 7736 // int i = 0; 7737 // for (; i < len; i++) { 7738 // char c = StringUTF16.getChar(sa, sp++); 7739 // if (c > '\u00FF') 7740 // break; 7741 // da[dp++] = (byte)c; 7742 // } 7743 // return i; 7744 //} 7745 // 7746 //@IntrinsicCandidate 7747 //private static int implEncodeAsciiArray(char[] sa, int sp, 7748 // byte[] da, int dp, int len) { 7749 // int i = 0; 7750 // for (; i < len; i++) { 7751 // char c = sa[sp++]; 7752 // if (c >= '\u0080') 7753 // break; 7754 // da[dp++] = (byte)c; 7755 // } 7756 // return i; 7757 //} 7758 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, 7759 XMMRegister tmp1Reg, XMMRegister tmp2Reg, 7760 XMMRegister tmp3Reg, XMMRegister tmp4Reg, 7761 Register tmp5, Register result, bool ascii) { 7762 7763 // rsi: src 7764 // rdi: dst 7765 // rdx: len 7766 // rcx: tmp5 7767 // rax: result 7768 ShortBranchVerifier sbv(this); 7769 assert_different_registers(src, dst, len, tmp5, result); 7770 Label L_done, L_copy_1_char, L_copy_1_char_exit; 7771 7772 int mask = ascii ? 0xff80ff80 : 0xff00ff00; 7773 int short_mask = ascii ? 
0xff80 : 0xff00; 7774 7775 // set result 7776 xorl(result, result); 7777 // check for zero length 7778 testl(len, len); 7779 jcc(Assembler::zero, L_done); 7780 7781 movl(result, len); 7782 7783 // Setup pointers 7784 lea(src, Address(src, len, Address::times_2)); // char[] 7785 lea(dst, Address(dst, len, Address::times_1)); // byte[] 7786 negptr(len); 7787 7788 if (UseSSE42Intrinsics || UseAVX >= 2) { 7789 Label L_copy_8_chars, L_copy_8_chars_exit; 7790 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit; 7791 7792 if (UseAVX >= 2) { 7793 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit; 7794 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector 7795 movdl(tmp1Reg, tmp5); 7796 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit); 7797 jmp(L_chars_32_check); 7798 7799 bind(L_copy_32_chars); 7800 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64)); 7801 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32)); 7802 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 7803 vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector 7804 jccb(Assembler::notZero, L_copy_32_chars_exit); 7805 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 7806 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1); 7807 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg); 7808 7809 bind(L_chars_32_check); 7810 addptr(len, 32); 7811 jcc(Assembler::lessEqual, L_copy_32_chars); 7812 7813 bind(L_copy_32_chars_exit); 7814 subptr(len, 16); 7815 jccb(Assembler::greater, L_copy_16_chars_exit); 7816 7817 } else if (UseSSE42Intrinsics) { 7818 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector 7819 movdl(tmp1Reg, tmp5); 7820 pshufd(tmp1Reg, tmp1Reg, 0); 7821 jmpb(L_chars_16_check); 7822 } 7823 7824 bind(L_copy_16_chars); 7825 if (UseAVX >= 2) { 7826 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32)); 7827 vptest(tmp2Reg, tmp1Reg); 7828 jcc(Assembler::notZero, L_copy_16_chars_exit); 7829 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1); 7830 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1); 7831 } else { 7832 if (UseAVX > 0) { 7833 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); 7834 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); 7835 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0); 7836 } else { 7837 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); 7838 por(tmp2Reg, tmp3Reg); 7839 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); 7840 por(tmp2Reg, tmp4Reg); 7841 } 7842 ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector 7843 jccb(Assembler::notZero, L_copy_16_chars_exit); 7844 packuswb(tmp3Reg, tmp4Reg); 7845 } 7846 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg); 7847 7848 bind(L_chars_16_check); 7849 addptr(len, 16); 7850 jcc(Assembler::lessEqual, L_copy_16_chars); 7851 7852 bind(L_copy_16_chars_exit); 7853 if (UseAVX >= 2) { 7854 // clean upper bits of YMM registers 7855 vpxor(tmp2Reg, tmp2Reg); 7856 vpxor(tmp3Reg, tmp3Reg); 7857 vpxor(tmp4Reg, tmp4Reg); 7858 movdl(tmp1Reg, tmp5); 7859 pshufd(tmp1Reg, tmp1Reg, 0); 7860 } 7861 subptr(len, 8); 7862 jccb(Assembler::greater, L_copy_8_chars_exit); 7863 7864 bind(L_copy_8_chars); 7865 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16)); 7866 ptest(tmp3Reg, tmp1Reg); 7867 jccb(Assembler::notZero, L_copy_8_chars_exit); 7868 packuswb(tmp3Reg, tmp1Reg); 7869 movq(Address(dst, len, Address::times_1, -8), tmp3Reg); 7870 addptr(len, 8); 7871 
jccb(Assembler::lessEqual, L_copy_8_chars); 7872 7873 bind(L_copy_8_chars_exit); 7874 subptr(len, 8); 7875 jccb(Assembler::zero, L_done); 7876 } 7877 7878 bind(L_copy_1_char); 7879 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0)); 7880 testl(tmp5, short_mask); // check if Unicode or non-ASCII char 7881 jccb(Assembler::notZero, L_copy_1_char_exit); 7882 movb(Address(dst, len, Address::times_1, 0), tmp5); 7883 addptr(len, 1); 7884 jccb(Assembler::less, L_copy_1_char); 7885 7886 bind(L_copy_1_char_exit); 7887 addptr(result, len); // len is negative count of not processed elements 7888 7889 bind(L_done); 7890 } 7891 7892 #ifdef _LP64 7893 /** 7894 * Helper for multiply_to_len(). 7895 */ 7896 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) { 7897 addq(dest_lo, src1); 7898 adcq(dest_hi, 0); 7899 addq(dest_lo, src2); 7900 adcq(dest_hi, 0); 7901 } 7902 7903 /** 7904 * Multiply 64 bit by 64 bit first loop. 7905 */ 7906 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, 7907 Register y, Register y_idx, Register z, 7908 Register carry, Register product, 7909 Register idx, Register kdx) { 7910 // 7911 // jlong carry, x[], y[], z[]; 7912 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 7913 // huge_128 product = y[idx] * x[xstart] + carry; 7914 // z[kdx] = (jlong)product; 7915 // carry = (jlong)(product >>> 64); 7916 // } 7917 // z[xstart] = carry; 7918 // 7919 7920 Label L_first_loop, L_first_loop_exit; 7921 Label L_one_x, L_one_y, L_multiply; 7922 7923 decrementl(xstart); 7924 jcc(Assembler::negative, L_one_x); 7925 7926 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 7927 rorq(x_xstart, 32); // convert big-endian to little-endian 7928 7929 bind(L_first_loop); 7930 decrementl(idx); 7931 jcc(Assembler::negative, L_first_loop_exit); 7932 decrementl(idx); 7933 jcc(Assembler::negative, L_one_y); 7934 movq(y_idx, Address(y, idx, Address::times_4, 0)); 7935 rorq(y_idx, 32); // convert big-endian to little-endian 7936 bind(L_multiply); 7937 movq(product, x_xstart); 7938 mulq(y_idx); // product(rax) * y_idx -> rdx:rax 7939 addq(product, carry); 7940 adcq(rdx, 0); 7941 subl(kdx, 2); 7942 movl(Address(z, kdx, Address::times_4, 4), product); 7943 shrq(product, 32); 7944 movl(Address(z, kdx, Address::times_4, 0), product); 7945 movq(carry, rdx); 7946 jmp(L_first_loop); 7947 7948 bind(L_one_y); 7949 movl(y_idx, Address(y, 0)); 7950 jmp(L_multiply); 7951 7952 bind(L_one_x); 7953 movl(x_xstart, Address(x, 0)); 7954 jmp(L_first_loop); 7955 7956 bind(L_first_loop_exit); 7957 } 7958 7959 /** 7960 * Multiply 64 bit by 64 bit and add 128 bit. 
7961 */ 7962 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z, 7963 Register yz_idx, Register idx, 7964 Register carry, Register product, int offset) { 7965 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 7966 // z[kdx] = (jlong)product; 7967 7968 movq(yz_idx, Address(y, idx, Address::times_4, offset)); 7969 rorq(yz_idx, 32); // convert big-endian to little-endian 7970 movq(product, x_xstart); 7971 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 7972 movq(yz_idx, Address(z, idx, Address::times_4, offset)); 7973 rorq(yz_idx, 32); // convert big-endian to little-endian 7974 7975 add2_with_carry(rdx, product, carry, yz_idx); 7976 7977 movl(Address(z, idx, Address::times_4, offset+4), product); 7978 shrq(product, 32); 7979 movl(Address(z, idx, Address::times_4, offset), product); 7980 7981 } 7982 7983 /** 7984 * Multiply 128 bit by 128 bit. Unrolled inner loop. 7985 */ 7986 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z, 7987 Register yz_idx, Register idx, Register jdx, 7988 Register carry, Register product, 7989 Register carry2) { 7990 // jlong carry, x[], y[], z[]; 7991 // int kdx = ystart+1; 7992 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 7993 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 7994 // z[kdx+idx+1] = (jlong)product; 7995 // jlong carry2 = (jlong)(product >>> 64); 7996 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 7997 // z[kdx+idx] = (jlong)product; 7998 // carry = (jlong)(product >>> 64); 7999 // } 8000 // idx += 2; 8001 // if (idx > 0) { 8002 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 8003 // z[kdx+idx] = (jlong)product; 8004 // carry = (jlong)(product >>> 64); 8005 // } 8006 // 8007 8008 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 8009 8010 movl(jdx, idx); 8011 andl(jdx, 0xFFFFFFFC); 8012 shrl(jdx, 2); 8013 8014 bind(L_third_loop); 8015 subl(jdx, 1); 8016 jcc(Assembler::negative, L_third_loop_exit); 8017 subl(idx, 4); 8018 8019 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); 8020 movq(carry2, rdx); 8021 8022 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); 8023 movq(carry, rdx); 8024 jmp(L_third_loop); 8025 8026 bind (L_third_loop_exit); 8027 8028 andl (idx, 0x3); 8029 jcc(Assembler::zero, L_post_third_loop_done); 8030 8031 Label L_check_1; 8032 subl(idx, 2); 8033 jcc(Assembler::negative, L_check_1); 8034 8035 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0); 8036 movq(carry, rdx); 8037 8038 bind (L_check_1); 8039 addl (idx, 0x2); 8040 andl (idx, 0x1); 8041 subl(idx, 1); 8042 jcc(Assembler::negative, L_post_third_loop_done); 8043 8044 movl(yz_idx, Address(y, idx, Address::times_4, 0)); 8045 movq(product, x_xstart); 8046 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 8047 movl(yz_idx, Address(z, idx, Address::times_4, 0)); 8048 8049 add2_with_carry(rdx, product, yz_idx, carry); 8050 8051 movl(Address(z, idx, Address::times_4, 0), product); 8052 shrq(product, 32); 8053 8054 shlq(rdx, 32); 8055 orq(product, rdx); 8056 movq(carry, product); 8057 8058 bind(L_post_third_loop_done); 8059 } 8060 8061 /** 8062 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop. 
8063 * 8064 */ 8065 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z, 8066 Register carry, Register carry2, 8067 Register idx, Register jdx, 8068 Register yz_idx1, Register yz_idx2, 8069 Register tmp, Register tmp3, Register tmp4) { 8070 assert(UseBMI2Instructions, "should be used only when BMI2 is available"); 8071 8072 // jlong carry, x[], y[], z[]; 8073 // int kdx = ystart+1; 8074 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 8075 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry; 8076 // jlong carry2 = (jlong)(tmp3 >>> 64); 8077 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2; 8078 // carry = (jlong)(tmp4 >>> 64); 8079 // z[kdx+idx+1] = (jlong)tmp3; 8080 // z[kdx+idx] = (jlong)tmp4; 8081 // } 8082 // idx += 2; 8083 // if (idx > 0) { 8084 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry; 8085 // z[kdx+idx] = (jlong)yz_idx1; 8086 // carry = (jlong)(yz_idx1 >>> 64); 8087 // } 8088 // 8089 8090 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 8091 8092 movl(jdx, idx); 8093 andl(jdx, 0xFFFFFFFC); 8094 shrl(jdx, 2); 8095 8096 bind(L_third_loop); 8097 subl(jdx, 1); 8098 jcc(Assembler::negative, L_third_loop_exit); 8099 subl(idx, 4); 8100 8101 movq(yz_idx1, Address(y, idx, Address::times_4, 8)); 8102 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 8103 movq(yz_idx2, Address(y, idx, Address::times_4, 0)); 8104 rorxq(yz_idx2, yz_idx2, 32); 8105 8106 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 8107 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp 8108 8109 movq(yz_idx1, Address(z, idx, Address::times_4, 8)); 8110 rorxq(yz_idx1, yz_idx1, 32); 8111 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 8112 rorxq(yz_idx2, yz_idx2, 32); 8113 8114 if (VM_Version::supports_adx()) { 8115 adcxq(tmp3, carry); 8116 adoxq(tmp3, yz_idx1); 8117 8118 adcxq(tmp4, tmp); 8119 adoxq(tmp4, yz_idx2); 8120 8121 movl(carry, 0); // does not affect flags 8122 adcxq(carry2, carry); 8123 adoxq(carry2, carry); 8124 } else { 8125 add2_with_carry(tmp4, tmp3, carry, yz_idx1); 8126 add2_with_carry(carry2, tmp4, tmp, yz_idx2); 8127 } 8128 movq(carry, carry2); 8129 8130 movl(Address(z, idx, Address::times_4, 12), tmp3); 8131 shrq(tmp3, 32); 8132 movl(Address(z, idx, Address::times_4, 8), tmp3); 8133 8134 movl(Address(z, idx, Address::times_4, 4), tmp4); 8135 shrq(tmp4, 32); 8136 movl(Address(z, idx, Address::times_4, 0), tmp4); 8137 8138 jmp(L_third_loop); 8139 8140 bind (L_third_loop_exit); 8141 8142 andl (idx, 0x3); 8143 jcc(Assembler::zero, L_post_third_loop_done); 8144 8145 Label L_check_1; 8146 subl(idx, 2); 8147 jcc(Assembler::negative, L_check_1); 8148 8149 movq(yz_idx1, Address(y, idx, Address::times_4, 0)); 8150 rorxq(yz_idx1, yz_idx1, 32); 8151 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 8152 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 8153 rorxq(yz_idx2, yz_idx2, 32); 8154 8155 add2_with_carry(tmp4, tmp3, carry, yz_idx2); 8156 8157 movl(Address(z, idx, Address::times_4, 4), tmp3); 8158 shrq(tmp3, 32); 8159 movl(Address(z, idx, Address::times_4, 0), tmp3); 8160 movq(carry, tmp4); 8161 8162 bind (L_check_1); 8163 addl (idx, 0x2); 8164 andl (idx, 0x1); 8165 subl(idx, 1); 8166 jcc(Assembler::negative, L_post_third_loop_done); 8167 movl(tmp4, Address(y, idx, Address::times_4, 0)); 8168 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3 8169 movl(tmp4, Address(z, idx, Address::times_4, 0)); 8170 8171 add2_with_carry(carry2, tmp3, tmp4, carry); 8172 8173 movl(Address(z, idx, 
Address::times_4, 0), tmp3); 8174 shrq(tmp3, 32); 8175 8176 shlq(carry2, 32); 8177 orq(tmp3, carry2); 8178 movq(carry, tmp3); 8179 8180 bind(L_post_third_loop_done); 8181 } 8182 8183 /** 8184 * Code for BigInteger::multiplyToLen() intrinsic. 8185 * 8186 * rdi: x 8187 * rax: xlen 8188 * rsi: y 8189 * rcx: ylen 8190 * r8: z 8191 * r11: tmp0 8192 * r12: tmp1 8193 * r13: tmp2 8194 * r14: tmp3 8195 * r15: tmp4 8196 * rbx: tmp5 8197 * 8198 */ 8199 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0, 8200 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 8201 ShortBranchVerifier sbv(this); 8202 assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx); 8203 8204 push(tmp0); 8205 push(tmp1); 8206 push(tmp2); 8207 push(tmp3); 8208 push(tmp4); 8209 push(tmp5); 8210 8211 push(xlen); 8212 8213 const Register idx = tmp1; 8214 const Register kdx = tmp2; 8215 const Register xstart = tmp3; 8216 8217 const Register y_idx = tmp4; 8218 const Register carry = tmp5; 8219 const Register product = xlen; 8220 const Register x_xstart = tmp0; 8221 8222 // First Loop. 8223 // 8224 // final static long LONG_MASK = 0xffffffffL; 8225 // int xstart = xlen - 1; 8226 // int ystart = ylen - 1; 8227 // long carry = 0; 8228 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 8229 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 8230 // z[kdx] = (int)product; 8231 // carry = product >>> 32; 8232 // } 8233 // z[xstart] = (int)carry; 8234 // 8235 8236 movl(idx, ylen); // idx = ylen; 8237 lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen; 8238 xorq(carry, carry); // carry = 0; 8239 8240 Label L_done; 8241 8242 movl(xstart, xlen); 8243 decrementl(xstart); 8244 jcc(Assembler::negative, L_done); 8245 8246 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 8247 8248 Label L_second_loop; 8249 testl(kdx, kdx); 8250 jcc(Assembler::zero, L_second_loop); 8251 8252 Label L_carry; 8253 subl(kdx, 1); 8254 jcc(Assembler::zero, L_carry); 8255 8256 movl(Address(z, kdx, Address::times_4, 0), carry); 8257 shrq(carry, 32); 8258 subl(kdx, 1); 8259 8260 bind(L_carry); 8261 movl(Address(z, kdx, Address::times_4, 0), carry); 8262 8263 // Second and third (nested) loops. 
8264 // 8265 // for (int i = xstart-1; i >= 0; i--) { // Second loop 8266 // carry = 0; 8267 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 8268 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 8269 // (z[k] & LONG_MASK) + carry; 8270 // z[k] = (int)product; 8271 // carry = product >>> 32; 8272 // } 8273 // z[i] = (int)carry; 8274 // } 8275 // 8276 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 8277 8278 const Register jdx = tmp1; 8279 8280 bind(L_second_loop); 8281 xorl(carry, carry); // carry = 0; 8282 movl(jdx, ylen); // j = ystart+1 8283 8284 subl(xstart, 1); // i = xstart-1; 8285 jcc(Assembler::negative, L_done); 8286 8287 push (z); 8288 8289 Label L_last_x; 8290 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j 8291 subl(xstart, 1); // i = xstart-1; 8292 jcc(Assembler::negative, L_last_x); 8293 8294 if (UseBMI2Instructions) { 8295 movq(rdx, Address(x, xstart, Address::times_4, 0)); 8296 rorxq(rdx, rdx, 32); // convert big-endian to little-endian 8297 } else { 8298 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 8299 rorq(x_xstart, 32); // convert big-endian to little-endian 8300 } 8301 8302 Label L_third_loop_prologue; 8303 bind(L_third_loop_prologue); 8304 8305 push (x); 8306 push (xstart); 8307 push (ylen); 8308 8309 8310 if (UseBMI2Instructions) { 8311 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4); 8312 } else { // !UseBMI2Instructions 8313 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x); 8314 } 8315 8316 pop(ylen); 8317 pop(xlen); 8318 pop(x); 8319 pop(z); 8320 8321 movl(tmp3, xlen); 8322 addl(tmp3, 1); 8323 movl(Address(z, tmp3, Address::times_4, 0), carry); 8324 subl(tmp3, 1); 8325 jccb(Assembler::negative, L_done); 8326 8327 shrq(carry, 32); 8328 movl(Address(z, tmp3, Address::times_4, 0), carry); 8329 jmp(L_second_loop); 8330 8331 // Next infrequent code is moved outside loops. 8332 bind(L_last_x); 8333 if (UseBMI2Instructions) { 8334 movl(rdx, Address(x, 0)); 8335 } else { 8336 movl(x_xstart, Address(x, 0)); 8337 } 8338 jmp(L_third_loop_prologue); 8339 8340 bind(L_done); 8341 8342 pop(xlen); 8343 8344 pop(tmp5); 8345 pop(tmp4); 8346 pop(tmp3); 8347 pop(tmp2); 8348 pop(tmp1); 8349 pop(tmp0); 8350 } 8351 8352 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale, 8353 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){ 8354 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled."); 8355 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP; 8356 Label VECTOR8_TAIL, VECTOR4_TAIL; 8357 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL; 8358 Label SAME_TILL_END, DONE; 8359 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL; 8360 8361 //scale is in rcx in both Win64 and Unix 8362 ShortBranchVerifier sbv(this); 8363 8364 shlq(length); 8365 xorq(result, result); 8366 8367 if ((AVX3Threshold == 0) && (UseAVX > 2) && 8368 VM_Version::supports_avx512vlbw()) { 8369 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL; 8370 8371 cmpq(length, 64); 8372 jcc(Assembler::less, VECTOR32_TAIL); 8373 8374 movq(tmp1, length); 8375 andq(tmp1, 0x3F); // tail count 8376 andq(length, ~(0x3F)); //vector count 8377 8378 bind(VECTOR64_LOOP); 8379 // AVX512 code to compare 64 byte vectors. 
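    // evpcmpeqb sets one bit in k7 for every byte that compares equal;
    // kortestql sets CF only when all 64 bits are set, so carry-clear
    // (aboveEqual) below means at least one byte differed and we jump to
    // VECTOR64_NOT_EQUAL, which locates the first differing byte with tzcnt on
    // the inverted mask.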
8380 evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit); 8381 evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit); 8382 kortestql(k7, k7); 8383 jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch 8384 addq(result, 64); 8385 subq(length, 64); 8386 jccb(Assembler::notZero, VECTOR64_LOOP); 8387 8388 //bind(VECTOR64_TAIL); 8389 testq(tmp1, tmp1); 8390 jcc(Assembler::zero, SAME_TILL_END); 8391 8392 //bind(VECTOR64_TAIL); 8393 // AVX512 code to compare up to 63 byte vectors. 8394 mov64(tmp2, 0xFFFFFFFFFFFFFFFF); 8395 shlxq(tmp2, tmp2, tmp1); 8396 notq(tmp2); 8397 kmovql(k3, tmp2); 8398 8399 evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit); 8400 evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit); 8401 8402 ktestql(k7, k3); 8403 jcc(Assembler::below, SAME_TILL_END); // not mismatch 8404 8405 bind(VECTOR64_NOT_EQUAL); 8406 kmovql(tmp1, k7); 8407 notq(tmp1); 8408 tzcntq(tmp1, tmp1); 8409 addq(result, tmp1); 8410 shrq(result); 8411 jmp(DONE); 8412 bind(VECTOR32_TAIL); 8413 } 8414 8415 cmpq(length, 8); 8416 jcc(Assembler::equal, VECTOR8_LOOP); 8417 jcc(Assembler::less, VECTOR4_TAIL); 8418 8419 if (UseAVX >= 2) { 8420 Label VECTOR16_TAIL, VECTOR32_LOOP; 8421 8422 cmpq(length, 16); 8423 jcc(Assembler::equal, VECTOR16_LOOP); 8424 jcc(Assembler::less, VECTOR8_LOOP); 8425 8426 cmpq(length, 32); 8427 jccb(Assembler::less, VECTOR16_TAIL); 8428 8429 subq(length, 32); 8430 bind(VECTOR32_LOOP); 8431 vmovdqu(rymm0, Address(obja, result)); 8432 vmovdqu(rymm1, Address(objb, result)); 8433 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit); 8434 vptest(rymm2, rymm2); 8435 jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found 8436 addq(result, 32); 8437 subq(length, 32); 8438 jcc(Assembler::greaterEqual, VECTOR32_LOOP); 8439 addq(length, 32); 8440 jcc(Assembler::equal, SAME_TILL_END); 8441 //falling through if less than 32 bytes left //close the branch here. 
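  // From here on the remaining bytes are handled with progressively narrower compares:
  // one 16-byte (XMM) chunk, then an 8-byte, a 4-byte and finally a byte-at-a-time tail,
  // each falling through to the next until a mismatch is found or SAME_TILL_END is reached.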
8442 8443 bind(VECTOR16_TAIL); 8444 cmpq(length, 16); 8445 jccb(Assembler::less, VECTOR8_TAIL); 8446 bind(VECTOR16_LOOP); 8447 movdqu(rymm0, Address(obja, result)); 8448 movdqu(rymm1, Address(objb, result)); 8449 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit); 8450 ptest(rymm2, rymm2); 8451 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 8452 addq(result, 16); 8453 subq(length, 16); 8454 jcc(Assembler::equal, SAME_TILL_END); 8455 //falling through if less than 16 bytes left 8456 } else {//regular intrinsics 8457 8458 cmpq(length, 16); 8459 jccb(Assembler::less, VECTOR8_TAIL); 8460 8461 subq(length, 16); 8462 bind(VECTOR16_LOOP); 8463 movdqu(rymm0, Address(obja, result)); 8464 movdqu(rymm1, Address(objb, result)); 8465 pxor(rymm0, rymm1); 8466 ptest(rymm0, rymm0); 8467 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 8468 addq(result, 16); 8469 subq(length, 16); 8470 jccb(Assembler::greaterEqual, VECTOR16_LOOP); 8471 addq(length, 16); 8472 jcc(Assembler::equal, SAME_TILL_END); 8473 //falling through if less than 16 bytes left 8474 } 8475 8476 bind(VECTOR8_TAIL); 8477 cmpq(length, 8); 8478 jccb(Assembler::less, VECTOR4_TAIL); 8479 bind(VECTOR8_LOOP); 8480 movq(tmp1, Address(obja, result)); 8481 movq(tmp2, Address(objb, result)); 8482 xorq(tmp1, tmp2); 8483 testq(tmp1, tmp1); 8484 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found 8485 addq(result, 8); 8486 subq(length, 8); 8487 jcc(Assembler::equal, SAME_TILL_END); 8488 //falling through if less than 8 bytes left 8489 8490 bind(VECTOR4_TAIL); 8491 cmpq(length, 4); 8492 jccb(Assembler::less, BYTES_TAIL); 8493 bind(VECTOR4_LOOP); 8494 movl(tmp1, Address(obja, result)); 8495 xorl(tmp1, Address(objb, result)); 8496 testl(tmp1, tmp1); 8497 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found 8498 addq(result, 4); 8499 subq(length, 4); 8500 jcc(Assembler::equal, SAME_TILL_END); 8501 //falling through if less than 4 bytes left 8502 8503 bind(BYTES_TAIL); 8504 bind(BYTES_LOOP); 8505 load_unsigned_byte(tmp1, Address(obja, result)); 8506 load_unsigned_byte(tmp2, Address(objb, result)); 8507 xorl(tmp1, tmp2); 8508 testl(tmp1, tmp1); 8509 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 8510 decq(length); 8511 jcc(Assembler::zero, SAME_TILL_END); 8512 incq(result); 8513 load_unsigned_byte(tmp1, Address(obja, result)); 8514 load_unsigned_byte(tmp2, Address(objb, result)); 8515 xorl(tmp1, tmp2); 8516 testl(tmp1, tmp1); 8517 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 8518 decq(length); 8519 jcc(Assembler::zero, SAME_TILL_END); 8520 incq(result); 8521 load_unsigned_byte(tmp1, Address(obja, result)); 8522 load_unsigned_byte(tmp2, Address(objb, result)); 8523 xorl(tmp1, tmp2); 8524 testl(tmp1, tmp1); 8525 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 8526 jmp(SAME_TILL_END); 8527 8528 if (UseAVX >= 2) { 8529 bind(VECTOR32_NOT_EQUAL); 8530 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit); 8531 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit); 8532 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit); 8533 vpmovmskb(tmp1, rymm0); 8534 bsfq(tmp1, tmp1); 8535 addq(result, tmp1); 8536 shrq(result); 8537 jmp(DONE); 8538 } 8539 8540 bind(VECTOR16_NOT_EQUAL); 8541 if (UseAVX >= 2) { 8542 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit); 8543 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit); 8544 pxor(rymm0, rymm2); 8545 } else { 8546 pcmpeqb(rymm2, rymm2); 8547 pxor(rymm0, rymm1); 8548 pcmpeqb(rymm0, rymm1); 8549 pxor(rymm0, rymm2); 8550 } 8551 pmovmskb(tmp1, rymm0); 8552 
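  // tmp1 now holds one bit per byte of the 16-byte chunk, set where the operands differ;
  // bsf locates the first mismatching byte, the byte offset is accumulated into result,
  // and the final shift by the scale (held in rcx) converts it back to an element index.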
bsfq(tmp1, tmp1); 8553 addq(result, tmp1); 8554 shrq(result); 8555 jmpb(DONE); 8556 8557 bind(VECTOR8_NOT_EQUAL); 8558 bind(VECTOR4_NOT_EQUAL); 8559 bsfq(tmp1, tmp1); 8560 shrq(tmp1, 3); 8561 addq(result, tmp1); 8562 bind(BYTES_NOT_EQUAL); 8563 shrq(result); 8564 jmpb(DONE); 8565 8566 bind(SAME_TILL_END); 8567 mov64(result, -1); 8568 8569 bind(DONE); 8570 } 8571 8572 //Helper functions for square_to_len() 8573 8574 /** 8575 * Store the squares of x[], right shifted one bit (divided by 2) into z[] 8576 * Preserves x and z and modifies rest of the registers. 8577 */ 8578 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 8579 // Perform square and right shift by 1 8580 // Handle odd xlen case first, then for even xlen do the following 8581 // jlong carry = 0; 8582 // for (int j=0, i=0; j < xlen; j+=2, i+=4) { 8583 // huge_128 product = x[j:j+1] * x[j:j+1]; 8584 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); 8585 // z[i+2:i+3] = (jlong)(product >>> 1); 8586 // carry = (jlong)product; 8587 // } 8588 8589 xorq(tmp5, tmp5); // carry 8590 xorq(rdxReg, rdxReg); 8591 xorl(tmp1, tmp1); // index for x 8592 xorl(tmp4, tmp4); // index for z 8593 8594 Label L_first_loop, L_first_loop_exit; 8595 8596 testl(xlen, 1); 8597 jccb(Assembler::zero, L_first_loop); //jump if xlen is even 8598 8599 // Square and right shift by 1 the odd element using 32 bit multiply 8600 movl(raxReg, Address(x, tmp1, Address::times_4, 0)); 8601 imulq(raxReg, raxReg); 8602 shrq(raxReg, 1); 8603 adcq(tmp5, 0); 8604 movq(Address(z, tmp4, Address::times_4, 0), raxReg); 8605 incrementl(tmp1); 8606 addl(tmp4, 2); 8607 8608 // Square and right shift by 1 the rest using 64 bit multiply 8609 bind(L_first_loop); 8610 cmpptr(tmp1, xlen); 8611 jccb(Assembler::equal, L_first_loop_exit); 8612 8613 // Square 8614 movq(raxReg, Address(x, tmp1, Address::times_4, 0)); 8615 rorq(raxReg, 32); // convert big-endian to little-endian 8616 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax 8617 8618 // Right shift by 1 and save carry 8619 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 8620 rcrq(rdxReg, 1); 8621 rcrq(raxReg, 1); 8622 adcq(tmp5, 0); 8623 8624 // Store result in z 8625 movq(Address(z, tmp4, Address::times_4, 0), rdxReg); 8626 movq(Address(z, tmp4, Address::times_4, 8), raxReg); 8627 8628 // Update indices for x and z 8629 addl(tmp1, 2); 8630 addl(tmp4, 4); 8631 jmp(L_first_loop); 8632 8633 bind(L_first_loop_exit); 8634 } 8635 8636 8637 /** 8638 * Perform the following multiply add operation using BMI2 instructions 8639 * carry:sum = sum + op1*op2 + carry 8640 * op2 should be in rdx 8641 * op2 is preserved, all other registers are modified 8642 */ 8643 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) { 8644 // assert op2 is rdx 8645 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1 8646 addq(sum, carry); 8647 adcq(tmp2, 0); 8648 addq(sum, op1); 8649 adcq(tmp2, 0); 8650 movq(carry, tmp2); 8651 } 8652 8653 /** 8654 * Perform the following multiply add operation: 8655 * carry:sum = sum + op1*op2 + carry 8656 * Preserves op1, op2 and modifies rest of registers 8657 */ 8658 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) { 8659 // rdx:rax = op1 * op2 8660 movq(raxReg, op2); 8661 mulq(op1); 8662 8663 // rdx:rax = sum + carry + rdx:rax 8664 addq(sum, carry); 8665 
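  // the adc below folds the carry-out of this add into rdx, the high half of the 128-bit result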
adcq(rdxReg, 0); 8666 addq(sum, raxReg); 8667 adcq(rdxReg, 0); 8668 8669 // carry:sum = rdx:sum 8670 movq(carry, rdxReg); 8671 } 8672 8673 /** 8674 * Add 64 bit long carry into z[] with carry propagation. 8675 * Preserves z and carry register values and modifies rest of registers. 8676 * 8677 */ 8678 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { 8679 Label L_fourth_loop, L_fourth_loop_exit; 8680 8681 movl(tmp1, 1); 8682 subl(zlen, 2); 8683 addq(Address(z, zlen, Address::times_4, 0), carry); 8684 8685 bind(L_fourth_loop); 8686 jccb(Assembler::carryClear, L_fourth_loop_exit); 8687 subl(zlen, 2); 8688 jccb(Assembler::negative, L_fourth_loop_exit); 8689 addq(Address(z, zlen, Address::times_4, 0), tmp1); 8690 jmp(L_fourth_loop); 8691 bind(L_fourth_loop_exit); 8692 } 8693 8694 /** 8695 * Shift z[] left by 1 bit. 8696 * Preserves x, len, z and zlen registers and modifies rest of the registers. 8697 * 8698 */ 8699 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { 8700 8701 Label L_fifth_loop, L_fifth_loop_exit; 8702 8703 // Fifth loop 8704 // Perform primitiveLeftShift(z, zlen, 1) 8705 8706 const Register prev_carry = tmp1; 8707 const Register new_carry = tmp4; 8708 const Register value = tmp2; 8709 const Register zidx = tmp3; 8710 8711 // int zidx, carry; 8712 // long value; 8713 // carry = 0; 8714 // for (zidx = zlen-2; zidx >=0; zidx -= 2) { 8715 // (carry:value) = (z[i] << 1) | carry ; 8716 // z[i] = value; 8717 // } 8718 8719 movl(zidx, zlen); 8720 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register 8721 8722 bind(L_fifth_loop); 8723 decl(zidx); // Use decl to preserve carry flag 8724 decl(zidx); 8725 jccb(Assembler::negative, L_fifth_loop_exit); 8726 8727 if (UseBMI2Instructions) { 8728 movq(value, Address(z, zidx, Address::times_4, 0)); 8729 rclq(value, 1); 8730 rorxq(value, value, 32); 8731 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 8732 } 8733 else { 8734 // clear new_carry 8735 xorl(new_carry, new_carry); 8736 8737 // Shift z[i] by 1, or in previous carry and save new carry 8738 movq(value, Address(z, zidx, Address::times_4, 0)); 8739 shlq(value, 1); 8740 adcl(new_carry, 0); 8741 8742 orq(value, prev_carry); 8743 rorq(value, 0x20); 8744 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 8745 8746 // Set previous carry = new carry 8747 movl(prev_carry, new_carry); 8748 } 8749 jmp(L_fifth_loop); 8750 8751 bind(L_fifth_loop_exit); 8752 } 8753 8754 8755 /** 8756 * Code for BigInteger::squareToLen() intrinsic 8757 * 8758 * rdi: x 8759 * rsi: len 8760 * r8: z 8761 * rcx: zlen 8762 * r12: tmp1 8763 * r13: tmp2 8764 * r14: tmp3 8765 * r15: tmp4 8766 * rbx: tmp5 8767 * 8768 */ 8769 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 8770 8771 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply; 8772 push(tmp1); 8773 push(tmp2); 8774 push(tmp3); 8775 push(tmp4); 8776 push(tmp5); 8777 8778 // First loop 8779 // Store the squares, right shifted one bit (i.e., divided by 2). 8780 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); 8781 8782 // Add in off-diagonal sums. 8783 // 8784 // Second, third (nested) and fourth loops. 
8785 // zlen +=2; 8786 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) { 8787 // carry = 0; 8788 // long op2 = x[xidx:xidx+1]; 8789 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) { 8790 // k -= 2; 8791 // long op1 = x[j:j+1]; 8792 // long sum = z[k:k+1]; 8793 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs); 8794 // z[k:k+1] = sum; 8795 // } 8796 // add_one_64(z, k, carry, tmp_regs); 8797 // } 8798 8799 const Register carry = tmp5; 8800 const Register sum = tmp3; 8801 const Register op1 = tmp4; 8802 Register op2 = tmp2; 8803 8804 push(zlen); 8805 push(len); 8806 addl(zlen,2); 8807 bind(L_second_loop); 8808 xorq(carry, carry); 8809 subl(zlen, 4); 8810 subl(len, 2); 8811 push(zlen); 8812 push(len); 8813 cmpl(len, 0); 8814 jccb(Assembler::lessEqual, L_second_loop_exit); 8815 8816 // Multiply an array by one 64 bit long. 8817 if (UseBMI2Instructions) { 8818 op2 = rdxReg; 8819 movq(op2, Address(x, len, Address::times_4, 0)); 8820 rorxq(op2, op2, 32); 8821 } 8822 else { 8823 movq(op2, Address(x, len, Address::times_4, 0)); 8824 rorq(op2, 32); 8825 } 8826 8827 bind(L_third_loop); 8828 decrementl(len); 8829 jccb(Assembler::negative, L_third_loop_exit); 8830 decrementl(len); 8831 jccb(Assembler::negative, L_last_x); 8832 8833 movq(op1, Address(x, len, Address::times_4, 0)); 8834 rorq(op1, 32); 8835 8836 bind(L_multiply); 8837 subl(zlen, 2); 8838 movq(sum, Address(z, zlen, Address::times_4, 0)); 8839 8840 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry. 8841 if (UseBMI2Instructions) { 8842 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2); 8843 } 8844 else { 8845 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8846 } 8847 8848 movq(Address(z, zlen, Address::times_4, 0), sum); 8849 8850 jmp(L_third_loop); 8851 bind(L_third_loop_exit); 8852 8853 // Fourth loop 8854 // Add 64 bit long carry into z with carry propagation. 8855 // Uses offsetted zlen. 8856 add_one_64(z, zlen, carry, tmp1); 8857 8858 pop(len); 8859 pop(zlen); 8860 jmp(L_second_loop); 8861 8862 // Next infrequent code is moved outside loops. 8863 bind(L_last_x); 8864 movl(op1, Address(x, 0)); 8865 jmp(L_multiply); 8866 8867 bind(L_second_loop_exit); 8868 pop(len); 8869 pop(zlen); 8870 pop(len); 8871 pop(zlen); 8872 8873 // Fifth loop 8874 // Shift z left 1 bit. 8875 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4); 8876 8877 // z[zlen-1] |= x[len-1] & 1; 8878 movl(tmp3, Address(x, len, Address::times_4, -4)); 8879 andl(tmp3, 1); 8880 orl(Address(z, zlen, Address::times_4, -4), tmp3); 8881 8882 pop(tmp5); 8883 pop(tmp4); 8884 pop(tmp3); 8885 pop(tmp2); 8886 pop(tmp1); 8887 } 8888 8889 /** 8890 * Helper function for mul_add() 8891 * Multiply the in[] by int k and add to out[] starting at offset offs using 8892 * 128 bit by 32 bit multiply and return the carry in tmp5. 8893 * Only quad int aligned length of in[] is operated on in this function. 8894 * k is in rdxReg for BMI2Instructions, for others it is in tmp2. 8895 * This function preserves out, in and k registers. 8896 * len and offset point to the appropriate index in "in" & "out" correspondingly 8897 * tmp5 has the carry. 8898 * other registers are temporary and are modified. 
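 *
 * Roughly, per unrolled iteration (an illustrative sketch, not the exact Java source):
 *   for each of the two big-endian 64-bit halves of the next four ints of in[]:
 *     carry:sum = out[half] + in[half] * k + carry
 *     out[half] = sum
 * leaving the running carry in tmp5.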
8899 * 8900 */ 8901 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, 8902 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, 8903 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 8904 8905 Label L_first_loop, L_first_loop_exit; 8906 8907 movl(tmp1, len); 8908 shrl(tmp1, 2); 8909 8910 bind(L_first_loop); 8911 subl(tmp1, 1); 8912 jccb(Assembler::negative, L_first_loop_exit); 8913 8914 subl(len, 4); 8915 subl(offset, 4); 8916 8917 Register op2 = tmp2; 8918 const Register sum = tmp3; 8919 const Register op1 = tmp4; 8920 const Register carry = tmp5; 8921 8922 if (UseBMI2Instructions) { 8923 op2 = rdxReg; 8924 } 8925 8926 movq(op1, Address(in, len, Address::times_4, 8)); 8927 rorq(op1, 32); 8928 movq(sum, Address(out, offset, Address::times_4, 8)); 8929 rorq(sum, 32); 8930 if (UseBMI2Instructions) { 8931 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 8932 } 8933 else { 8934 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8935 } 8936 // Store back in big endian from little endian 8937 rorq(sum, 0x20); 8938 movq(Address(out, offset, Address::times_4, 8), sum); 8939 8940 movq(op1, Address(in, len, Address::times_4, 0)); 8941 rorq(op1, 32); 8942 movq(sum, Address(out, offset, Address::times_4, 0)); 8943 rorq(sum, 32); 8944 if (UseBMI2Instructions) { 8945 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 8946 } 8947 else { 8948 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8949 } 8950 // Store back in big endian from little endian 8951 rorq(sum, 0x20); 8952 movq(Address(out, offset, Address::times_4, 0), sum); 8953 8954 jmp(L_first_loop); 8955 bind(L_first_loop_exit); 8956 } 8957 8958 /** 8959 * Code for BigInteger::mulAdd() intrinsic 8960 * 8961 * rdi: out 8962 * rsi: in 8963 * r11: offs (out.length - offset) 8964 * rcx: len 8965 * r8: k 8966 * r12: tmp1 8967 * r13: tmp2 8968 * r14: tmp3 8969 * r15: tmp4 8970 * rbx: tmp5 8971 * Multiply the in[] by word k and add to out[], return the carry in rax 8972 */ 8973 void MacroAssembler::mul_add(Register out, Register in, Register offs, 8974 Register len, Register k, Register tmp1, Register tmp2, Register tmp3, 8975 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 8976 8977 Label L_carry, L_last_in, L_done; 8978 8979 // carry = 0; 8980 // for (int j=len-1; j >= 0; j--) { 8981 // long product = (in[j] & LONG_MASK) * kLong + 8982 // (out[offs] & LONG_MASK) + carry; 8983 // out[offs--] = (int)product; 8984 // carry = product >>> 32; 8985 // } 8986 // 8987 push(tmp1); 8988 push(tmp2); 8989 push(tmp3); 8990 push(tmp4); 8991 push(tmp5); 8992 8993 Register op2 = tmp2; 8994 const Register sum = tmp3; 8995 const Register op1 = tmp4; 8996 const Register carry = tmp5; 8997 8998 if (UseBMI2Instructions) { 8999 op2 = rdxReg; 9000 movl(op2, k); 9001 } 9002 else { 9003 movl(op2, k); 9004 } 9005 9006 xorq(carry, carry); 9007 9008 //First loop 9009 9010 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply 9011 //The carry is in tmp5 9012 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); 9013 9014 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any 9015 decrementl(len); 9016 jccb(Assembler::negative, L_carry); 9017 decrementl(len); 9018 jccb(Assembler::negative, L_last_in); 9019 9020 movq(op1, Address(in, len, Address::times_4, 0)); 9021 rorq(op1, 32); 9022 9023 subl(offs, 2); 9024 movq(sum, Address(out, offs, Address::times_4, 0)); 9025 rorq(sum, 32); 9026 9027 if (UseBMI2Instructions) { 9028 
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 9029 } 9030 else { 9031 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 9032 } 9033 9034 // Store back in big endian from little endian 9035 rorq(sum, 0x20); 9036 movq(Address(out, offs, Address::times_4, 0), sum); 9037 9038 testl(len, len); 9039 jccb(Assembler::zero, L_carry); 9040 9041 //Multiply the last in[] entry, if any 9042 bind(L_last_in); 9043 movl(op1, Address(in, 0)); 9044 movl(sum, Address(out, offs, Address::times_4, -4)); 9045 9046 movl(raxReg, k); 9047 mull(op1); //tmp4 * eax -> edx:eax 9048 addl(sum, carry); 9049 adcl(rdxReg, 0); 9050 addl(sum, raxReg); 9051 adcl(rdxReg, 0); 9052 movl(carry, rdxReg); 9053 9054 movl(Address(out, offs, Address::times_4, -4), sum); 9055 9056 bind(L_carry); 9057 //return tmp5/carry as carry in rax 9058 movl(rax, carry); 9059 9060 bind(L_done); 9061 pop(tmp5); 9062 pop(tmp4); 9063 pop(tmp3); 9064 pop(tmp2); 9065 pop(tmp1); 9066 } 9067 #endif 9068 9069 /** 9070 * Emits code to update CRC-32 with a byte value according to constants in table 9071 * 9072 * @param [in,out]crc Register containing the crc. 9073 * @param [in]val Register containing the byte to fold into the CRC. 9074 * @param [in]table Register containing the table of crc constants. 9075 * 9076 * uint32_t crc; 9077 * val = crc_table[(val ^ crc) & 0xFF]; 9078 * crc = val ^ (crc >> 8); 9079 * 9080 */ 9081 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 9082 xorl(val, crc); 9083 andl(val, 0xFF); 9084 shrl(crc, 8); // unsigned shift 9085 xorl(crc, Address(table, val, Address::times_4, 0)); 9086 } 9087 9088 /** 9089 * Fold 128-bit data chunk 9090 */ 9091 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { 9092 if (UseAVX > 0) { 9093 vpclmulhdq(xtmp, xK, xcrc); // [123:64] 9094 vpclmulldq(xcrc, xK, xcrc); // [63:0] 9095 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */); 9096 pxor(xcrc, xtmp); 9097 } else { 9098 movdqa(xtmp, xcrc); 9099 pclmulhdq(xtmp, xK); // [123:64] 9100 pclmulldq(xcrc, xK); // [63:0] 9101 pxor(xcrc, xtmp); 9102 movdqu(xtmp, Address(buf, offset)); 9103 pxor(xcrc, xtmp); 9104 } 9105 } 9106 9107 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { 9108 if (UseAVX > 0) { 9109 vpclmulhdq(xtmp, xK, xcrc); 9110 vpclmulldq(xcrc, xK, xcrc); 9111 pxor(xcrc, xbuf); 9112 pxor(xcrc, xtmp); 9113 } else { 9114 movdqa(xtmp, xcrc); 9115 pclmulhdq(xtmp, xK); 9116 pclmulldq(xcrc, xK); 9117 pxor(xcrc, xbuf); 9118 pxor(xcrc, xtmp); 9119 } 9120 } 9121 9122 /** 9123 * 8-bit folds to compute 32-bit CRC 9124 * 9125 * uint64_t xcrc; 9126 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8); 9127 */ 9128 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) { 9129 movdl(tmp, xcrc); 9130 andl(tmp, 0xFF); 9131 movdl(xtmp, Address(table, tmp, Address::times_4, 0)); 9132 psrldq(xcrc, 1); // unsigned shift one byte 9133 pxor(xcrc, xtmp); 9134 } 9135 9136 /** 9137 * uint32_t crc; 9138 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 9139 */ 9140 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 9141 movl(tmp, crc); 9142 andl(tmp, 0xFF); 9143 shrl(crc, 8); 9144 xorl(crc, Address(table, tmp, Address::times_4, 0)); 9145 } 9146 9147 /** 9148 * @param crc register containing existing CRC (32-bit) 9149 * @param buf register pointing to input byte buffer (byte*) 9150 * @param len register containing number of bytes 9151 * 
@param table register that will contain address of CRC table 9152 * @param tmp scratch register 9153 */ 9154 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) { 9155 assert_different_registers(crc, buf, len, table, tmp, rax); 9156 9157 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 9158 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 9159 9160 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 9161 // context for the registers used, where all instructions below are using 128-bit mode 9162 // On EVEX without VL and BW, these instructions will all be AVX. 9163 lea(table, ExternalAddress(StubRoutines::crc_table_addr())); 9164 notl(crc); // ~crc 9165 cmpl(len, 16); 9166 jcc(Assembler::less, L_tail); 9167 9168 // Align buffer to 16 bytes 9169 movl(tmp, buf); 9170 andl(tmp, 0xF); 9171 jccb(Assembler::zero, L_aligned); 9172 subl(tmp, 16); 9173 addl(len, tmp); 9174 9175 align(4); 9176 BIND(L_align_loop); 9177 movsbl(rax, Address(buf, 0)); // load byte with sign extension 9178 update_byte_crc32(crc, rax, table); 9179 increment(buf); 9180 incrementl(tmp); 9181 jccb(Assembler::less, L_align_loop); 9182 9183 BIND(L_aligned); 9184 movl(tmp, len); // save 9185 shrl(len, 4); 9186 jcc(Assembler::zero, L_tail_restore); 9187 9188 // Fold crc into first bytes of vector 9189 movdqa(xmm1, Address(buf, 0)); 9190 movdl(rax, xmm1); 9191 xorl(crc, rax); 9192 if (VM_Version::supports_sse4_1()) { 9193 pinsrd(xmm1, crc, 0); 9194 } else { 9195 pinsrw(xmm1, crc, 0); 9196 shrl(crc, 16); 9197 pinsrw(xmm1, crc, 1); 9198 } 9199 addptr(buf, 16); 9200 subl(len, 4); // len > 0 9201 jcc(Assembler::less, L_fold_tail); 9202 9203 movdqa(xmm2, Address(buf, 0)); 9204 movdqa(xmm3, Address(buf, 16)); 9205 movdqa(xmm4, Address(buf, 32)); 9206 addptr(buf, 48); 9207 subl(len, 3); 9208 jcc(Assembler::lessEqual, L_fold_512b); 9209 9210 // Fold total 512 bits of polynomial on each iteration, 9211 // 128 bits per each of 4 parallel streams. 9212 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1); 9213 9214 align32(); 9215 BIND(L_fold_512b_loop); 9216 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 9217 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16); 9218 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32); 9219 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48); 9220 addptr(buf, 64); 9221 subl(len, 4); 9222 jcc(Assembler::greater, L_fold_512b_loop); 9223 9224 // Fold 512 bits to 128 bits. 9225 BIND(L_fold_512b); 9226 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1); 9227 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); 9228 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3); 9229 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4); 9230 9231 // Fold the rest of 128 bits data chunks 9232 BIND(L_fold_tail); 9233 addl(len, 3); 9234 jccb(Assembler::lessEqual, L_fold_128b); 9235 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1); 9236 9237 BIND(L_fold_tail_loop); 9238 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 9239 addptr(buf, 16); 9240 decrementl(len); 9241 jccb(Assembler::greater, L_fold_tail_loop); 9242 9243 // Fold 128 bits in xmm1 down into 32 bits in crc register. 
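  // The carry-less multiplies below shrink that 128-bit remainder; the final reduction to a
  // 32-bit CRC is done with eight byte-at-a-time folds of the form
  //   crc = table[crc & 0xFF] ^ (crc >> 8)
  // (four while the value is still in xmm0, then four more once it is in the crc register).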
9244 BIND(L_fold_128b); 9245 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1); 9246 if (UseAVX > 0) { 9247 vpclmulqdq(xmm2, xmm0, xmm1, 0x1); 9248 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */); 9249 vpclmulqdq(xmm0, xmm0, xmm3, 0x1); 9250 } else { 9251 movdqa(xmm2, xmm0); 9252 pclmulqdq(xmm2, xmm1, 0x1); 9253 movdqa(xmm3, xmm0); 9254 pand(xmm3, xmm2); 9255 pclmulqdq(xmm0, xmm3, 0x1); 9256 } 9257 psrldq(xmm1, 8); 9258 psrldq(xmm2, 4); 9259 pxor(xmm0, xmm1); 9260 pxor(xmm0, xmm2); 9261 9262 // 8 8-bit folds to compute 32-bit CRC. 9263 for (int j = 0; j < 4; j++) { 9264 fold_8bit_crc32(xmm0, table, xmm1, rax); 9265 } 9266 movdl(crc, xmm0); // mov 32 bits to general register 9267 for (int j = 0; j < 4; j++) { 9268 fold_8bit_crc32(crc, table, rax); 9269 } 9270 9271 BIND(L_tail_restore); 9272 movl(len, tmp); // restore 9273 BIND(L_tail); 9274 andl(len, 0xf); 9275 jccb(Assembler::zero, L_exit); 9276 9277 // Fold the rest of bytes 9278 align(4); 9279 BIND(L_tail_loop); 9280 movsbl(rax, Address(buf, 0)); // load byte with sign extension 9281 update_byte_crc32(crc, rax, table); 9282 increment(buf); 9283 decrementl(len); 9284 jccb(Assembler::greater, L_tail_loop); 9285 9286 BIND(L_exit); 9287 notl(crc); // ~c 9288 } 9289 9290 #ifdef _LP64 9291 // Helper function for AVX 512 CRC32 9292 // Fold 512-bit data chunks 9293 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, 9294 Register pos, int offset) { 9295 evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit); 9296 evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64] 9297 evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0] 9298 evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */); 9299 evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */); 9300 } 9301 9302 // Helper function for AVX 512 CRC32 9303 // Compute CRC32 for < 256B buffers 9304 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos, 9305 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop, 9306 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) { 9307 9308 Label L_less_than_32, L_exact_16_left, L_less_than_16_left; 9309 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left; 9310 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2; 9311 9312 // check if there is enough buffer to be able to fold 16B at a time 9313 cmpl(len, 32); 9314 jcc(Assembler::less, L_less_than_32); 9315 9316 // if there is, load the constants 9317 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10 9318 movdl(xmm0, crc); // get the initial crc value 9319 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext 9320 pxor(xmm7, xmm0); 9321 9322 // update the buffer pointer 9323 addl(pos, 16); 9324 //update the counter.subtract 32 instead of 16 to save one instruction from the loop 9325 subl(len, 32); 9326 jmp(L_16B_reduction_loop); 9327 9328 bind(L_less_than_32); 9329 //mov initial crc to the return value. this is necessary for zero - length buffers. 
9330 movl(rax, crc); 9331 testl(len, len); 9332 jcc(Assembler::equal, L_cleanup); 9333 9334 movdl(xmm0, crc); //get the initial crc value 9335 9336 cmpl(len, 16); 9337 jcc(Assembler::equal, L_exact_16_left); 9338 jcc(Assembler::less, L_less_than_16_left); 9339 9340 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext 9341 pxor(xmm7, xmm0); //xor the initial crc value 9342 addl(pos, 16); 9343 subl(len, 16); 9344 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10 9345 jmp(L_get_last_two_xmms); 9346 9347 bind(L_less_than_16_left); 9348 //use stack space to load data less than 16 bytes, zero - out the 16B in memory first. 9349 pxor(xmm1, xmm1); 9350 movptr(tmp1, rsp); 9351 movdqu(Address(tmp1, 0 * 16), xmm1); 9352 9353 cmpl(len, 4); 9354 jcc(Assembler::less, L_only_less_than_4); 9355 9356 //backup the counter value 9357 movl(tmp2, len); 9358 cmpl(len, 8); 9359 jcc(Assembler::less, L_less_than_8_left); 9360 9361 //load 8 Bytes 9362 movq(rax, Address(buf, pos, Address::times_1, 0 * 16)); 9363 movq(Address(tmp1, 0 * 16), rax); 9364 addptr(tmp1, 8); 9365 subl(len, 8); 9366 addl(pos, 8); 9367 9368 bind(L_less_than_8_left); 9369 cmpl(len, 4); 9370 jcc(Assembler::less, L_less_than_4_left); 9371 9372 //load 4 Bytes 9373 movl(rax, Address(buf, pos, Address::times_1, 0)); 9374 movl(Address(tmp1, 0 * 16), rax); 9375 addptr(tmp1, 4); 9376 subl(len, 4); 9377 addl(pos, 4); 9378 9379 bind(L_less_than_4_left); 9380 cmpl(len, 2); 9381 jcc(Assembler::less, L_less_than_2_left); 9382 9383 // load 2 Bytes 9384 movw(rax, Address(buf, pos, Address::times_1, 0)); 9385 movl(Address(tmp1, 0 * 16), rax); 9386 addptr(tmp1, 2); 9387 subl(len, 2); 9388 addl(pos, 2); 9389 9390 bind(L_less_than_2_left); 9391 cmpl(len, 1); 9392 jcc(Assembler::less, L_zero_left); 9393 9394 // load 1 Byte 9395 movb(rax, Address(buf, pos, Address::times_1, 0)); 9396 movb(Address(tmp1, 0 * 16), rax); 9397 9398 bind(L_zero_left); 9399 movdqu(xmm7, Address(rsp, 0)); 9400 pxor(xmm7, xmm0); //xor the initial crc value 9401 9402 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); 9403 movdqu(xmm0, Address(rax, tmp2)); 9404 pshufb(xmm7, xmm0); 9405 jmp(L_128_done); 9406 9407 bind(L_exact_16_left); 9408 movdqu(xmm7, Address(buf, pos, Address::times_1, 0)); 9409 pxor(xmm7, xmm0); //xor the initial crc value 9410 jmp(L_128_done); 9411 9412 bind(L_only_less_than_4); 9413 cmpl(len, 3); 9414 jcc(Assembler::less, L_only_less_than_3); 9415 9416 // load 3 Bytes 9417 movb(rax, Address(buf, pos, Address::times_1, 0)); 9418 movb(Address(tmp1, 0), rax); 9419 9420 movb(rax, Address(buf, pos, Address::times_1, 1)); 9421 movb(Address(tmp1, 1), rax); 9422 9423 movb(rax, Address(buf, pos, Address::times_1, 2)); 9424 movb(Address(tmp1, 2), rax); 9425 9426 movdqu(xmm7, Address(rsp, 0)); 9427 pxor(xmm7, xmm0); //xor the initial crc value 9428 9429 pslldq(xmm7, 0x5); 9430 jmp(L_barrett); 9431 bind(L_only_less_than_3); 9432 cmpl(len, 2); 9433 jcc(Assembler::less, L_only_less_than_2); 9434 9435 // load 2 Bytes 9436 movb(rax, Address(buf, pos, Address::times_1, 0)); 9437 movb(Address(tmp1, 0), rax); 9438 9439 movb(rax, Address(buf, pos, Address::times_1, 1)); 9440 movb(Address(tmp1, 1), rax); 9441 9442 movdqu(xmm7, Address(rsp, 0)); 9443 pxor(xmm7, xmm0); //xor the initial crc value 9444 9445 pslldq(xmm7, 0x6); 9446 jmp(L_barrett); 9447 9448 bind(L_only_less_than_2); 9449 //load 1 Byte 9450 movb(rax, Address(buf, pos, Address::times_1, 0)); 9451 movb(Address(tmp1, 0), rax); 9452 9453 movdqu(xmm7, Address(rsp, 
0)); 9454 pxor(xmm7, xmm0); //xor the initial crc value 9455 9456 pslldq(xmm7, 0x7); 9457 } 9458 9459 /** 9460 * Compute CRC32 using AVX512 instructions 9461 * param crc register containing existing CRC (32-bit) 9462 * param buf register pointing to input byte buffer (byte*) 9463 * param len register containing number of bytes 9464 * param table address of crc or crc32c table 9465 * param tmp1 scratch register 9466 * param tmp2 scratch register 9467 * return rax result register 9468 * 9469 * This routine is identical for crc32c with the exception of the precomputed constant 9470 * table which will be passed as the table argument. The calculation steps are 9471 * the same for both variants. 9472 */ 9473 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) { 9474 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12); 9475 9476 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 9477 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 9478 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop; 9479 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop; 9480 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup; 9481 9482 const Register pos = r12; 9483 push(r12); 9484 subptr(rsp, 16 * 2 + 8); 9485 9486 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 9487 // context for the registers used, where all instructions below are using 128-bit mode 9488 // On EVEX without VL and BW, these instructions will all be AVX. 9489 movl(pos, 0); 9490 9491 // check if smaller than 256B 9492 cmpl(len, 256); 9493 jcc(Assembler::less, L_less_than_256); 9494 9495 // load the initial crc value 9496 movdl(xmm10, crc); 9497 9498 // receive the initial 64B data, xor the initial crc value 9499 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); 9500 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); 9501 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit); 9502 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4 9503 9504 subl(len, 256); 9505 cmpl(len, 256); 9506 jcc(Assembler::less, L_fold_128_B_loop); 9507 9508 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); 9509 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); 9510 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2 9511 subl(len, 256); 9512 9513 bind(L_fold_256_B_loop); 9514 addl(pos, 256); 9515 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64); 9516 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64); 9517 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64); 9518 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64); 9519 9520 subl(len, 256); 9521 jcc(Assembler::greaterEqual, L_fold_256_B_loop); 9522 9523 // Fold 256 into 128 9524 addl(pos, 256); 9525 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit); 9526 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit); 9527 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC 9528 9529 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit); 9530 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit); 9531 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC 9532 9533 evmovdquq(xmm0, xmm7, 
Assembler::AVX_512bit); 9534 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit); 9535 9536 addl(len, 128); 9537 jmp(L_fold_128_B_register); 9538 9539 // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop 9540 // loop will fold 128B at a time until we have 128 + y Bytes of buffer 9541 9542 // fold 128B at a time.This section of the code folds 8 xmm registers in parallel 9543 bind(L_fold_128_B_loop); 9544 addl(pos, 128); 9545 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64); 9546 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64); 9547 9548 subl(len, 128); 9549 jcc(Assembler::greaterEqual, L_fold_128_B_loop); 9550 9551 addl(pos, 128); 9552 9553 // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 9554 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 9555 bind(L_fold_128_B_register); 9556 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16 9557 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0 9558 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit); 9559 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit); 9560 // save last that has no multiplicand 9561 vextracti64x2(xmm7, xmm4, 3); 9562 9563 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit); 9564 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit); 9565 // Needed later in reduction loop 9566 movdqu(xmm10, Address(table, 1 * 16)); 9567 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC 9568 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC 9569 9570 // Swap 1,0,3,2 - 01 00 11 10 9571 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit); 9572 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit); 9573 vextracti128(xmm5, xmm8, 1); 9574 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit); 9575 9576 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop 9577 // instead of a cmp instruction, we use the negative flag with the jl instruction 9578 addl(len, 128 - 16); 9579 jcc(Assembler::less, L_final_reduction_for_128); 9580 9581 bind(L_16B_reduction_loop); 9582 vpclmulqdq(xmm8, xmm7, xmm10, 0x01); 9583 vpclmulqdq(xmm7, xmm7, xmm10, 0x10); 9584 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); 9585 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16)); 9586 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 9587 addl(pos, 16); 9588 subl(len, 16); 9589 jcc(Assembler::greaterEqual, L_16B_reduction_loop); 9590 9591 bind(L_final_reduction_for_128); 9592 addl(len, 16); 9593 jcc(Assembler::equal, L_128_done); 9594 9595 bind(L_get_last_two_xmms); 9596 movdqu(xmm2, xmm7); 9597 addl(pos, len); 9598 movdqu(xmm1, Address(buf, pos, Address::times_1, -16)); 9599 subl(pos, len); 9600 9601 // get rid of the extra data that was loaded before 9602 // load the shift constant 9603 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); 9604 movdqu(xmm0, Address(rax, len)); 9605 addl(rax, len); 9606 9607 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 9608 //Change mask to 512 9609 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2); 9610 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit); 9611 9612 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit); 9613 vpclmulqdq(xmm8, xmm7, xmm10, 0x01); 9614 vpclmulqdq(xmm7, 
xmm7, xmm10, 0x10); 9615 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); 9616 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit); 9617 9618 bind(L_128_done); 9619 // compute crc of a 128-bit value 9620 movdqu(xmm10, Address(table, 3 * 16)); 9621 movdqu(xmm0, xmm7); 9622 9623 // 64b fold 9624 vpclmulqdq(xmm7, xmm7, xmm10, 0x0); 9625 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit); 9626 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 9627 9628 // 32b fold 9629 movdqu(xmm0, xmm7); 9630 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit); 9631 vpclmulqdq(xmm7, xmm7, xmm10, 0x10); 9632 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 9633 jmp(L_barrett); 9634 9635 bind(L_less_than_256); 9636 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup); 9637 9638 //barrett reduction 9639 bind(L_barrett); 9640 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2); 9641 movdqu(xmm1, xmm7); 9642 movdqu(xmm2, xmm7); 9643 movdqu(xmm10, Address(table, 4 * 16)); 9644 9645 pclmulqdq(xmm7, xmm10, 0x0); 9646 pxor(xmm7, xmm2); 9647 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2); 9648 movdqu(xmm2, xmm7); 9649 pclmulqdq(xmm7, xmm10, 0x10); 9650 pxor(xmm7, xmm2); 9651 pxor(xmm7, xmm1); 9652 pextrd(crc, xmm7, 2); 9653 9654 bind(L_cleanup); 9655 addptr(rsp, 16 * 2 + 8); 9656 pop(r12); 9657 } 9658 9659 // S. Gueron / Information Processing Letters 112 (2012) 184 9660 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. 9661 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0]. 9662 // Output: the 64-bit carry-less product of B * CONST 9663 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n, 9664 Register tmp1, Register tmp2, Register tmp3) { 9665 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); 9666 if (n > 0) { 9667 addq(tmp3, n * 256 * 8); 9668 } 9669 // Q1 = TABLEExt[n][B & 0xFF]; 9670 movl(tmp1, in); 9671 andl(tmp1, 0x000000FF); 9672 shll(tmp1, 3); 9673 addq(tmp1, tmp3); 9674 movq(tmp1, Address(tmp1, 0)); 9675 9676 // Q2 = TABLEExt[n][B >> 8 & 0xFF]; 9677 movl(tmp2, in); 9678 shrl(tmp2, 8); 9679 andl(tmp2, 0x000000FF); 9680 shll(tmp2, 3); 9681 addq(tmp2, tmp3); 9682 movq(tmp2, Address(tmp2, 0)); 9683 9684 shlq(tmp2, 8); 9685 xorq(tmp1, tmp2); 9686 9687 // Q3 = TABLEExt[n][B >> 16 & 0xFF]; 9688 movl(tmp2, in); 9689 shrl(tmp2, 16); 9690 andl(tmp2, 0x000000FF); 9691 shll(tmp2, 3); 9692 addq(tmp2, tmp3); 9693 movq(tmp2, Address(tmp2, 0)); 9694 9695 shlq(tmp2, 16); 9696 xorq(tmp1, tmp2); 9697 9698 // Q4 = TABLEExt[n][B >> 24 & 0xFF]; 9699 shrl(in, 24); 9700 andl(in, 0x000000FF); 9701 shll(in, 3); 9702 addq(in, tmp3); 9703 movq(in, Address(in, 0)); 9704 9705 shlq(in, 24); 9706 xorq(in, tmp1); 9707 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; 9708 } 9709 9710 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, 9711 Register in_out, 9712 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, 9713 XMMRegister w_xtmp2, 9714 Register tmp1, 9715 Register n_tmp2, Register n_tmp3) { 9716 if (is_pclmulqdq_supported) { 9717 movdl(w_xtmp1, in_out); // modified blindly 9718 9719 movl(tmp1, const_or_pre_comp_const_index); 9720 movdl(w_xtmp2, tmp1); 9721 pclmulqdq(w_xtmp1, w_xtmp2, 0); 9722 9723 movdq(in_out, w_xtmp1); 9724 } else { 9725 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3); 9726 } 9727 } 9728 9729 // 
Recombination Alternative 2: No bit-reflections 9730 // T1 = (CRC_A * U1) << 1 9731 // T2 = (CRC_B * U2) << 1 9732 // C1 = T1 >> 32 9733 // C2 = T2 >> 32 9734 // T1 = T1 & 0xFFFFFFFF 9735 // T2 = T2 & 0xFFFFFFFF 9736 // T1 = CRC32(0, T1) 9737 // T2 = CRC32(0, T2) 9738 // C1 = C1 ^ T1 9739 // C2 = C2 ^ T2 9740 // CRC = C1 ^ C2 ^ CRC_C 9741 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, 9742 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 9743 Register tmp1, Register tmp2, 9744 Register n_tmp3) { 9745 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 9746 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 9747 shlq(in_out, 1); 9748 movl(tmp1, in_out); 9749 shrq(in_out, 32); 9750 xorl(tmp2, tmp2); 9751 crc32(tmp2, tmp1, 4); 9752 xorl(in_out, tmp2); // we don't care about upper 32 bit contents here 9753 shlq(in1, 1); 9754 movl(tmp1, in1); 9755 shrq(in1, 32); 9756 xorl(tmp2, tmp2); 9757 crc32(tmp2, tmp1, 4); 9758 xorl(in1, tmp2); 9759 xorl(in_out, in1); 9760 xorl(in_out, in2); 9761 } 9762 9763 // Set N to predefined value 9764 // Subtract from a length of a buffer 9765 // execute in a loop: 9766 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0 9767 // for i = 1 to N do 9768 // CRC_A = CRC32(CRC_A, A[i]) 9769 // CRC_B = CRC32(CRC_B, B[i]) 9770 // CRC_C = CRC32(CRC_C, C[i]) 9771 // end for 9772 // Recombine 9773 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, 9774 Register in_out1, Register in_out2, Register in_out3, 9775 Register tmp1, Register tmp2, Register tmp3, 9776 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 9777 Register tmp4, Register tmp5, 9778 Register n_tmp6) { 9779 Label L_processPartitions; 9780 Label L_processPartition; 9781 Label L_exit; 9782 9783 bind(L_processPartitions); 9784 cmpl(in_out1, 3 * size); 9785 jcc(Assembler::less, L_exit); 9786 xorl(tmp1, tmp1); 9787 xorl(tmp2, tmp2); 9788 movq(tmp3, in_out2); 9789 addq(tmp3, size); 9790 9791 bind(L_processPartition); 9792 crc32(in_out3, Address(in_out2, 0), 8); 9793 crc32(tmp1, Address(in_out2, size), 8); 9794 crc32(tmp2, Address(in_out2, size * 2), 8); 9795 addq(in_out2, 8); 9796 cmpq(in_out2, tmp3); 9797 jcc(Assembler::less, L_processPartition); 9798 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, 9799 w_xtmp1, w_xtmp2, w_xtmp3, 9800 tmp4, tmp5, 9801 n_tmp6); 9802 addq(in_out2, 2 * size); 9803 subl(in_out1, 3 * size); 9804 jmp(L_processPartitions); 9805 9806 bind(L_exit); 9807 } 9808 #else 9809 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n, 9810 Register tmp1, Register tmp2, Register tmp3, 9811 XMMRegister xtmp1, XMMRegister xtmp2) { 9812 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); 9813 if (n > 0) { 9814 addl(tmp3, n * 256 * 8); 9815 } 9816 // Q1 = TABLEExt[n][B & 0xFF]; 9817 movl(tmp1, in_out); 9818 andl(tmp1, 0x000000FF); 9819 shll(tmp1, 3); 9820 addl(tmp1, tmp3); 9821 movq(xtmp1, Address(tmp1, 0)); 9822 9823 // Q2 = TABLEExt[n][B >> 8 & 0xFF]; 9824 movl(tmp2, in_out); 9825 shrl(tmp2, 8); 9826 andl(tmp2, 0x000000FF); 9827 shll(tmp2, 3); 9828 addl(tmp2, tmp3); 9829 movq(xtmp2, 
Address(tmp2, 0)); 9830 9831 psllq(xtmp2, 8); 9832 pxor(xtmp1, xtmp2); 9833 9834 // Q3 = TABLEExt[n][B >> 16 & 0xFF]; 9835 movl(tmp2, in_out); 9836 shrl(tmp2, 16); 9837 andl(tmp2, 0x000000FF); 9838 shll(tmp2, 3); 9839 addl(tmp2, tmp3); 9840 movq(xtmp2, Address(tmp2, 0)); 9841 9842 psllq(xtmp2, 16); 9843 pxor(xtmp1, xtmp2); 9844 9845 // Q4 = TABLEExt[n][B >> 24 & 0xFF]; 9846 shrl(in_out, 24); 9847 andl(in_out, 0x000000FF); 9848 shll(in_out, 3); 9849 addl(in_out, tmp3); 9850 movq(xtmp2, Address(in_out, 0)); 9851 9852 psllq(xtmp2, 24); 9853 pxor(xtmp1, xtmp2); // Result in CXMM 9854 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; 9855 } 9856 9857 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, 9858 Register in_out, 9859 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, 9860 XMMRegister w_xtmp2, 9861 Register tmp1, 9862 Register n_tmp2, Register n_tmp3) { 9863 if (is_pclmulqdq_supported) { 9864 movdl(w_xtmp1, in_out); 9865 9866 movl(tmp1, const_or_pre_comp_const_index); 9867 movdl(w_xtmp2, tmp1); 9868 pclmulqdq(w_xtmp1, w_xtmp2, 0); 9869 // Keep result in XMM since GPR is 32 bit in length 9870 } else { 9871 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2); 9872 } 9873 } 9874 9875 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, 9876 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 9877 Register tmp1, Register tmp2, 9878 Register n_tmp3) { 9879 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 9880 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 9881 9882 psllq(w_xtmp1, 1); 9883 movdl(tmp1, w_xtmp1); 9884 psrlq(w_xtmp1, 32); 9885 movdl(in_out, w_xtmp1); 9886 9887 xorl(tmp2, tmp2); 9888 crc32(tmp2, tmp1, 4); 9889 xorl(in_out, tmp2); 9890 9891 psllq(w_xtmp2, 1); 9892 movdl(tmp1, w_xtmp2); 9893 psrlq(w_xtmp2, 32); 9894 movdl(in1, w_xtmp2); 9895 9896 xorl(tmp2, tmp2); 9897 crc32(tmp2, tmp1, 4); 9898 xorl(in1, tmp2); 9899 xorl(in_out, in1); 9900 xorl(in_out, in2); 9901 } 9902 9903 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, 9904 Register in_out1, Register in_out2, Register in_out3, 9905 Register tmp1, Register tmp2, Register tmp3, 9906 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 9907 Register tmp4, Register tmp5, 9908 Register n_tmp6) { 9909 Label L_processPartitions; 9910 Label L_processPartition; 9911 Label L_exit; 9912 9913 bind(L_processPartitions); 9914 cmpl(in_out1, 3 * size); 9915 jcc(Assembler::less, L_exit); 9916 xorl(tmp1, tmp1); 9917 xorl(tmp2, tmp2); 9918 movl(tmp3, in_out2); 9919 addl(tmp3, size); 9920 9921 bind(L_processPartition); 9922 crc32(in_out3, Address(in_out2, 0), 4); 9923 crc32(tmp1, Address(in_out2, size), 4); 9924 crc32(tmp2, Address(in_out2, size*2), 4); 9925 crc32(in_out3, Address(in_out2, 0+4), 4); 9926 crc32(tmp1, Address(in_out2, size+4), 4); 9927 crc32(tmp2, Address(in_out2, size*2+4), 4); 9928 addl(in_out2, 8); 9929 cmpl(in_out2, tmp3); 9930 jcc(Assembler::less, L_processPartition); 9931 9932 push(tmp3); 9933 push(in_out1); 9934 push(in_out2); 9935 tmp4 = tmp3; 9936 tmp5 = in_out1; 9937 n_tmp6 = in_out2; 9938 9939 
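  // tmp3, in_out1 and in_out2 were pushed above so they can double as the extra scratch
  // registers this 32-bit path needs for the recombination call; they are popped right after.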
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, 9940 w_xtmp1, w_xtmp2, w_xtmp3, 9941 tmp4, tmp5, 9942 n_tmp6); 9943 9944 pop(in_out2); 9945 pop(in_out1); 9946 pop(tmp3); 9947 9948 addl(in_out2, 2 * size); 9949 subl(in_out1, 3 * size); 9950 jmp(L_processPartitions); 9951 9952 bind(L_exit); 9953 } 9954 #endif //LP64 9955 9956 #ifdef _LP64 9957 // Algorithm 2: Pipelined usage of the CRC32 instruction. 9958 // Input: A buffer I of L bytes. 9959 // Output: the CRC32C value of the buffer. 9960 // Notations: 9961 // Write L = 24N + r, with N = floor (L/24). 9962 // r = L mod 24 (0 <= r < 24). 9963 // Consider I as the concatenation of A|B|C|R, where A, B, C, each, 9964 // N quadwords, and R consists of r bytes. 9965 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1 9966 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1 9967 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1 9968 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1 9969 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, 9970 Register tmp1, Register tmp2, Register tmp3, 9971 Register tmp4, Register tmp5, Register tmp6, 9972 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 9973 bool is_pclmulqdq_supported) { 9974 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; 9975 Label L_wordByWord; 9976 Label L_byteByByteProlog; 9977 Label L_byteByByte; 9978 Label L_exit; 9979 9980 if (is_pclmulqdq_supported ) { 9981 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr(); 9982 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1); 9983 9984 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2); 9985 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3); 9986 9987 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4); 9988 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5); 9989 assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\""); 9990 } else { 9991 const_or_pre_comp_const_index[0] = 1; 9992 const_or_pre_comp_const_index[1] = 0; 9993 9994 const_or_pre_comp_const_index[2] = 3; 9995 const_or_pre_comp_const_index[3] = 2; 9996 9997 const_or_pre_comp_const_index[4] = 5; 9998 const_or_pre_comp_const_index[5] = 4; 9999 } 10000 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, 10001 in2, in1, in_out, 10002 tmp1, tmp2, tmp3, 10003 w_xtmp1, w_xtmp2, w_xtmp3, 10004 tmp4, tmp5, 10005 tmp6); 10006 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, 10007 in2, in1, in_out, 10008 tmp1, tmp2, tmp3, 10009 w_xtmp1, w_xtmp2, w_xtmp3, 10010 tmp4, tmp5, 10011 tmp6); 10012 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, 10013 in2, in1, in_out, 10014 tmp1, tmp2, tmp3, 10015 w_xtmp1, w_xtmp2, w_xtmp3, 10016 tmp4, tmp5, 10017 tmp6); 10018 movl(tmp1, in2); 10019 andl(tmp1, 0x00000007); 10020 negl(tmp1); 10021 addl(tmp1, in2); 10022 addq(tmp1, in1); 10023 10024 cmpq(in1, tmp1); 10025 jccb(Assembler::greaterEqual, L_byteByByteProlog); 10026 align(16); 10027 BIND(L_wordByWord); 10028 crc32(in_out, Address(in1, 0), 8); 10029 addq(in1, 8); 
10030 cmpq(in1, tmp1); 10031 jcc(Assembler::less, L_wordByWord); 10032 10033 BIND(L_byteByByteProlog); 10034 andl(in2, 0x00000007); 10035 movl(tmp2, 1); 10036 10037 cmpl(tmp2, in2); 10038 jccb(Assembler::greater, L_exit); 10039 BIND(L_byteByByte); 10040 crc32(in_out, Address(in1, 0), 1); 10041 incq(in1); 10042 incl(tmp2); 10043 cmpl(tmp2, in2); 10044 jcc(Assembler::lessEqual, L_byteByByte); 10045 10046 BIND(L_exit); 10047 } 10048 #else 10049 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, 10050 Register tmp1, Register tmp2, Register tmp3, 10051 Register tmp4, Register tmp5, Register tmp6, 10052 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 10053 bool is_pclmulqdq_supported) { 10054 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; 10055 Label L_wordByWord; 10056 Label L_byteByByteProlog; 10057 Label L_byteByByte; 10058 Label L_exit; 10059 10060 if (is_pclmulqdq_supported) { 10061 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr(); 10062 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1); 10063 10064 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2); 10065 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3); 10066 10067 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4); 10068 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5); 10069 } else { 10070 const_or_pre_comp_const_index[0] = 1; 10071 const_or_pre_comp_const_index[1] = 0; 10072 10073 const_or_pre_comp_const_index[2] = 3; 10074 const_or_pre_comp_const_index[3] = 2; 10075 10076 const_or_pre_comp_const_index[4] = 5; 10077 const_or_pre_comp_const_index[5] = 4; 10078 } 10079 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, 10080 in2, in1, in_out, 10081 tmp1, tmp2, tmp3, 10082 w_xtmp1, w_xtmp2, w_xtmp3, 10083 tmp4, tmp5, 10084 tmp6); 10085 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, 10086 in2, in1, in_out, 10087 tmp1, tmp2, tmp3, 10088 w_xtmp1, w_xtmp2, w_xtmp3, 10089 tmp4, tmp5, 10090 tmp6); 10091 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, 10092 in2, in1, in_out, 10093 tmp1, tmp2, tmp3, 10094 w_xtmp1, w_xtmp2, w_xtmp3, 10095 tmp4, tmp5, 10096 tmp6); 10097 movl(tmp1, in2); 10098 andl(tmp1, 0x00000007); 10099 negl(tmp1); 10100 addl(tmp1, in2); 10101 addl(tmp1, in1); 10102 10103 BIND(L_wordByWord); 10104 cmpl(in1, tmp1); 10105 jcc(Assembler::greaterEqual, L_byteByByteProlog); 10106 crc32(in_out, Address(in1,0), 4); 10107 addl(in1, 4); 10108 jmp(L_wordByWord); 10109 10110 BIND(L_byteByByteProlog); 10111 andl(in2, 0x00000007); 10112 movl(tmp2, 1); 10113 10114 BIND(L_byteByByte); 10115 cmpl(tmp2, in2); 10116 jccb(Assembler::greater, L_exit); 10117 movb(tmp1, Address(in1, 0)); 10118 crc32(in_out, tmp1, 1); 10119 incl(in1); 10120 incl(tmp2); 10121 jmp(L_byteByByte); 10122 10123 BIND(L_exit); 10124 } 10125 #endif // LP64 10126 #undef BIND 10127 #undef BLOCK_COMMENT 10128 10129 // Compress char[] array to byte[]. 
// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
// Returns the array length if every element in the array can be encoded;
// otherwise, returns the index of the first non-latin1 (> 0xff) character.
// @IntrinsicCandidate
// public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
//   for (int i = 0; i < len; i++) {
//     char c = src[srcOff];
//     if (c > 0xff) {
//       return i;  // return index of non-latin1 char
//     }
//     dst[dstOff] = (byte)c;
//     srcOff++;
//     dstOff++;
//   }
//   return len;
// }
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                         XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                         Register tmp5, Register result, KRegister mask1, KRegister mask2) {
  Label copy_chars_loop, done, reset_sp, copy_tail;

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result

  // rsi holds start addr of source char[] to be compressed
  // rdi holds start addr of destination byte[]
  // rdx holds length

  assert(len != result, "");

  // save length for return
  movl(result, len);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;

    // alignment
    Label post_alignment;

    // if the length of the string is less than 32, handle it the old-fashioned way
    testl(len, -32);
    jcc(Assembler::zero, below_threshold);

    // First check whether a character is compressible (<= 0xFF).
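    // A value above 0xFF cannot be stored in the byte[] destination, so the intrinsic
    // reports the index of the first such char instead of compressing it.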
    // Create mask to test for Unicode chars inside zmm vector
    movl(tmp5, 0x00FF);
    evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);

    testl(len, -64);
    jccb(Assembler::zero, post_alignment);

    movl(tmp5, dst);
    andl(tmp5, (32 - 1));
    negl(tmp5);
    andl(tmp5, (32 - 1));

    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jccb(Assembler::zero, post_alignment);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(len, 0xFFFFFFFF);
    shlxl(len, len, tmp5);
    notl(len);
    kmovdl(mask2, len);
    movl(len, result);

    evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
    ktestd(mask1, mask2);
    jcc(Assembler::carryClear, copy_tail);

    evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);

    addptr(src, tmp5);
    addptr(src, tmp5);
    addptr(dst, tmp5);
    subl(len, tmp5);

    bind(post_alignment);
    // end of alignment

    movl(tmp5, len);
    andl(tmp5, (32 - 1));   // tail count (in chars)
    andl(len, ~(32 - 1));   // vector count (in chars)
    jccb(Assembler::zero, copy_loop_tail);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
    evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    kortestdl(mask1, mask1);
    jccb(Assembler::carryClear, reset_for_copy_tail);

    // All elements in the currently processed chunk are valid candidates for
    // compression. Write truncated byte elements to memory.
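    // evpmovwb keeps only the low byte of each 16-bit lane, so one iteration stores 32 bytes.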
10236 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit); 10237 addptr(len, 32); 10238 jccb(Assembler::notZero, copy_32_loop); 10239 10240 bind(copy_loop_tail); 10241 // bail out when there is nothing to be done 10242 testl(tmp5, 0xFFFFFFFF); 10243 jcc(Assembler::zero, done); 10244 10245 movl(len, tmp5); 10246 10247 // ~(~0 << len), where len is the # of remaining elements to process 10248 movl(tmp5, 0xFFFFFFFF); 10249 shlxl(tmp5, tmp5, len); 10250 notl(tmp5); 10251 10252 kmovdl(mask2, tmp5); 10253 10254 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); 10255 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); 10256 ktestd(mask1, mask2); 10257 jcc(Assembler::carryClear, copy_tail); 10258 10259 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); 10260 jmp(done); 10261 10262 bind(reset_for_copy_tail); 10263 lea(src, Address(src, tmp5, Address::times_2)); 10264 lea(dst, Address(dst, tmp5, Address::times_1)); 10265 subptr(len, tmp5); 10266 jmp(copy_chars_loop); 10267 10268 bind(below_threshold); 10269 } 10270 10271 if (UseSSE42Intrinsics) { 10272 Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail; 10273 10274 // vectored compression 10275 testl(len, 0xfffffff8); 10276 jcc(Assembler::zero, copy_tail); 10277 10278 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors 10279 movdl(tmp1Reg, tmp5); 10280 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg 10281 10282 andl(len, 0xfffffff0); 10283 jccb(Assembler::zero, copy_16); 10284 10285 // compress 16 chars per iter 10286 pxor(tmp4Reg, tmp4Reg); 10287 10288 lea(src, Address(src, len, Address::times_2)); 10289 lea(dst, Address(dst, len, Address::times_1)); 10290 negptr(len); 10291 10292 bind(copy_32_loop); 10293 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters 10294 por(tmp4Reg, tmp2Reg); 10295 movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters 10296 por(tmp4Reg, tmp3Reg); 10297 ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector 10298 jccb(Assembler::notZero, reset_for_copy_tail); 10299 packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte 10300 movdqu(Address(dst, len, Address::times_1), tmp2Reg); 10301 addptr(len, 16); 10302 jccb(Assembler::notZero, copy_32_loop); 10303 10304 // compress next vector of 8 chars (if any) 10305 bind(copy_16); 10306 // len = 0 10307 testl(result, 0x00000008); // check if there's a block of 8 chars to compress 10308 jccb(Assembler::zero, copy_tail_sse); 10309 10310 pxor(tmp3Reg, tmp3Reg); 10311 10312 movdqu(tmp2Reg, Address(src, 0)); 10313 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector 10314 jccb(Assembler::notZero, reset_for_copy_tail); 10315 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte 10316 movq(Address(dst, 0), tmp2Reg); 10317 addptr(src, 16); 10318 addptr(dst, 8); 10319 jmpb(copy_tail_sse); 10320 10321 bind(reset_for_copy_tail); 10322 movl(tmp5, result); 10323 andl(tmp5, 0x0000000f); 10324 lea(src, Address(src, tmp5, Address::times_2)); 10325 lea(dst, Address(dst, tmp5, Address::times_1)); 10326 subptr(len, tmp5); 10327 jmpb(copy_chars_loop); 10328 10329 bind(copy_tail_sse); 10330 movl(len, result); 10331 andl(len, 0x00000007); // tail count (in chars) 10332 } 10333 // compress 1 char per iter 10334 bind(copy_tail); 10335 testl(len, len); 10336 jccb(Assembler::zero, done); 10337 lea(src, Address(src, len, 
Address::times_2)); 10338 lea(dst, Address(dst, len, Address::times_1)); 10339 negptr(len); 10340 10341 bind(copy_chars_loop); 10342 load_unsigned_short(tmp5, Address(src, len, Address::times_2)); 10343 testl(tmp5, 0xff00); // check if Unicode char 10344 jccb(Assembler::notZero, reset_sp); 10345 movb(Address(dst, len, Address::times_1), tmp5); // ASCII char; compress to 1 byte 10346 increment(len); 10347 jccb(Assembler::notZero, copy_chars_loop); 10348 10349 // add len then return (len will be zero if compress succeeded, otherwise negative) 10350 bind(reset_sp); 10351 addl(result, len); 10352 10353 bind(done); 10354 } 10355 10356 // Inflate byte[] array to char[]. 10357 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java 10358 // @IntrinsicCandidate 10359 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) { 10360 // for (int i = 0; i < len; i++) { 10361 // dst[dstOff++] = (char)(src[srcOff++] & 0xff); 10362 // } 10363 // } 10364 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, 10365 XMMRegister tmp1, Register tmp2, KRegister mask) { 10366 Label copy_chars_loop, done, below_threshold, avx3_threshold; 10367 // rsi: src 10368 // rdi: dst 10369 // rdx: len 10370 // rcx: tmp2 10371 10372 // rsi holds start addr of source byte[] to be inflated 10373 // rdi holds start addr of destination char[] 10374 // rdx holds length 10375 assert_different_registers(src, dst, len, tmp2); 10376 movl(tmp2, len); 10377 if ((UseAVX > 2) && // AVX512 10378 VM_Version::supports_avx512vlbw() && 10379 VM_Version::supports_bmi2()) { 10380 10381 Label copy_32_loop, copy_tail; 10382 Register tmp3_aliased = len; 10383 10384 // if length of the string is less than 16, handle it in an old fashioned way 10385 testl(len, -16); 10386 jcc(Assembler::zero, below_threshold); 10387 10388 testl(len, -1 * AVX3Threshold); 10389 jcc(Assembler::zero, avx3_threshold); 10390 10391 // In order to use only one arithmetic operation for the main loop we use 10392 // this pre-calculation 10393 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop 10394 andl(len, -32); // vector count 10395 jccb(Assembler::zero, copy_tail); 10396 10397 lea(src, Address(src, len, Address::times_1)); 10398 lea(dst, Address(dst, len, Address::times_2)); 10399 negptr(len); 10400 10401 10402 // inflate 32 chars per iter 10403 bind(copy_32_loop); 10404 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit); 10405 evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit); 10406 addptr(len, 32); 10407 jcc(Assembler::notZero, copy_32_loop); 10408 10409 bind(copy_tail); 10410 // bail out when there is nothing to be done 10411 testl(tmp2, -1); // we don't destroy the contents of tmp2 here 10412 jcc(Assembler::zero, done); 10413 10414 // ~(~0 << length), where length is the # of remaining elements to process 10415 movl(tmp3_aliased, -1); 10416 shlxl(tmp3_aliased, tmp3_aliased, tmp2); 10417 notl(tmp3_aliased); 10418 kmovdl(mask, tmp3_aliased); 10419 evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit); 10420 evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit); 10421 10422 jmp(done); 10423 bind(avx3_threshold); 10424 } 10425 if (UseSSE42Intrinsics) { 10426 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail; 10427 10428 if (UseAVX > 1) { 10429 andl(tmp2, (16 - 1)); 10430 andl(len, -16); 10431 jccb(Assembler::zero, copy_new_tail); 10432 } else { 10433 andl(tmp2, 0x00000007); // tail 
count (in chars) 10434 andl(len, 0xfffffff8); // vector count (in chars) 10435 jccb(Assembler::zero, copy_tail); 10436 } 10437 10438 // vectored inflation 10439 lea(src, Address(src, len, Address::times_1)); 10440 lea(dst, Address(dst, len, Address::times_2)); 10441 negptr(len); 10442 10443 if (UseAVX > 1) { 10444 bind(copy_16_loop); 10445 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit); 10446 vmovdqu(Address(dst, len, Address::times_2), tmp1); 10447 addptr(len, 16); 10448 jcc(Assembler::notZero, copy_16_loop); 10449 10450 bind(below_threshold); 10451 bind(copy_new_tail); 10452 movl(len, tmp2); 10453 andl(tmp2, 0x00000007); 10454 andl(len, 0xFFFFFFF8); 10455 jccb(Assembler::zero, copy_tail); 10456 10457 pmovzxbw(tmp1, Address(src, 0)); 10458 movdqu(Address(dst, 0), tmp1); 10459 addptr(src, 8); 10460 addptr(dst, 2 * 8); 10461 10462 jmp(copy_tail, true); 10463 } 10464 10465 // inflate 8 chars per iter 10466 bind(copy_8_loop); 10467 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words 10468 movdqu(Address(dst, len, Address::times_2), tmp1); 10469 addptr(len, 8); 10470 jcc(Assembler::notZero, copy_8_loop); 10471 10472 bind(copy_tail); 10473 movl(len, tmp2); 10474 10475 cmpl(len, 4); 10476 jccb(Assembler::less, copy_bytes); 10477 10478 movdl(tmp1, Address(src, 0)); // load 4 byte chars 10479 pmovzxbw(tmp1, tmp1); 10480 movq(Address(dst, 0), tmp1); 10481 subptr(len, 4); 10482 addptr(src, 4); 10483 addptr(dst, 8); 10484 10485 bind(copy_bytes); 10486 } else { 10487 bind(below_threshold); 10488 } 10489 10490 testl(len, len); 10491 jccb(Assembler::zero, done); 10492 lea(src, Address(src, len, Address::times_1)); 10493 lea(dst, Address(dst, len, Address::times_2)); 10494 negptr(len); 10495 10496 // inflate 1 char per iter 10497 bind(copy_chars_loop); 10498 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char 10499 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word 10500 increment(len); 10501 jcc(Assembler::notZero, copy_chars_loop); 10502 10503 bind(done); 10504 } 10505 10506 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 10507 switch(type) { 10508 case T_BYTE: 10509 case T_BOOLEAN: 10510 evmovdqub(dst, kmask, src, merge, vector_len); 10511 break; 10512 case T_CHAR: 10513 case T_SHORT: 10514 evmovdquw(dst, kmask, src, merge, vector_len); 10515 break; 10516 case T_INT: 10517 case T_FLOAT: 10518 evmovdqul(dst, kmask, src, merge, vector_len); 10519 break; 10520 case T_LONG: 10521 case T_DOUBLE: 10522 evmovdquq(dst, kmask, src, merge, vector_len); 10523 break; 10524 default: 10525 fatal("Unexpected type argument %s", type2name(type)); 10526 break; 10527 } 10528 } 10529 10530 10531 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 10532 switch(type) { 10533 case T_BYTE: 10534 case T_BOOLEAN: 10535 evmovdqub(dst, kmask, src, merge, vector_len); 10536 break; 10537 case T_CHAR: 10538 case T_SHORT: 10539 evmovdquw(dst, kmask, src, merge, vector_len); 10540 break; 10541 case T_INT: 10542 case T_FLOAT: 10543 evmovdqul(dst, kmask, src, merge, vector_len); 10544 break; 10545 case T_LONG: 10546 case T_DOUBLE: 10547 evmovdquq(dst, kmask, src, merge, vector_len); 10548 break; 10549 default: 10550 fatal("Unexpected type argument %s", type2name(type)); 10551 break; 10552 } 10553 } 10554 10555 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address 
dst, XMMRegister src, bool merge, int vector_len) { 10556 switch(type) { 10557 case T_BYTE: 10558 case T_BOOLEAN: 10559 evmovdqub(dst, kmask, src, merge, vector_len); 10560 break; 10561 case T_CHAR: 10562 case T_SHORT: 10563 evmovdquw(dst, kmask, src, merge, vector_len); 10564 break; 10565 case T_INT: 10566 case T_FLOAT: 10567 evmovdqul(dst, kmask, src, merge, vector_len); 10568 break; 10569 case T_LONG: 10570 case T_DOUBLE: 10571 evmovdquq(dst, kmask, src, merge, vector_len); 10572 break; 10573 default: 10574 fatal("Unexpected type argument %s", type2name(type)); 10575 break; 10576 } 10577 } 10578 10579 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) { 10580 switch(masklen) { 10581 case 2: 10582 knotbl(dst, src); 10583 movl(rtmp, 3); 10584 kmovbl(ktmp, rtmp); 10585 kandbl(dst, ktmp, dst); 10586 break; 10587 case 4: 10588 knotbl(dst, src); 10589 movl(rtmp, 15); 10590 kmovbl(ktmp, rtmp); 10591 kandbl(dst, ktmp, dst); 10592 break; 10593 case 8: 10594 knotbl(dst, src); 10595 break; 10596 case 16: 10597 knotwl(dst, src); 10598 break; 10599 case 32: 10600 knotdl(dst, src); 10601 break; 10602 case 64: 10603 knotql(dst, src); 10604 break; 10605 default: 10606 fatal("Unexpected vector length %d", masklen); 10607 break; 10608 } 10609 } 10610 10611 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 10612 switch(type) { 10613 case T_BOOLEAN: 10614 case T_BYTE: 10615 kandbl(dst, src1, src2); 10616 break; 10617 case T_CHAR: 10618 case T_SHORT: 10619 kandwl(dst, src1, src2); 10620 break; 10621 case T_INT: 10622 case T_FLOAT: 10623 kanddl(dst, src1, src2); 10624 break; 10625 case T_LONG: 10626 case T_DOUBLE: 10627 kandql(dst, src1, src2); 10628 break; 10629 default: 10630 fatal("Unexpected type argument %s", type2name(type)); 10631 break; 10632 } 10633 } 10634 10635 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 10636 switch(type) { 10637 case T_BOOLEAN: 10638 case T_BYTE: 10639 korbl(dst, src1, src2); 10640 break; 10641 case T_CHAR: 10642 case T_SHORT: 10643 korwl(dst, src1, src2); 10644 break; 10645 case T_INT: 10646 case T_FLOAT: 10647 kordl(dst, src1, src2); 10648 break; 10649 case T_LONG: 10650 case T_DOUBLE: 10651 korql(dst, src1, src2); 10652 break; 10653 default: 10654 fatal("Unexpected type argument %s", type2name(type)); 10655 break; 10656 } 10657 } 10658 10659 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 10660 switch(type) { 10661 case T_BOOLEAN: 10662 case T_BYTE: 10663 kxorbl(dst, src1, src2); 10664 break; 10665 case T_CHAR: 10666 case T_SHORT: 10667 kxorwl(dst, src1, src2); 10668 break; 10669 case T_INT: 10670 case T_FLOAT: 10671 kxordl(dst, src1, src2); 10672 break; 10673 case T_LONG: 10674 case T_DOUBLE: 10675 kxorql(dst, src1, src2); 10676 break; 10677 default: 10678 fatal("Unexpected type argument %s", type2name(type)); 10679 break; 10680 } 10681 } 10682 10683 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10684 switch(type) { 10685 case T_BOOLEAN: 10686 case T_BYTE: 10687 evpermb(dst, mask, nds, src, merge, vector_len); break; 10688 case T_CHAR: 10689 case T_SHORT: 10690 evpermw(dst, mask, nds, src, merge, vector_len); break; 10691 case T_INT: 10692 case T_FLOAT: 10693 evpermd(dst, mask, nds, src, merge, vector_len); break; 10694 case T_LONG: 10695 case T_DOUBLE: 10696 evpermq(dst, mask, nds, src, merge, 
vector_len); break; 10697 default: 10698 fatal("Unexpected type argument %s", type2name(type)); break; 10699 } 10700 } 10701 10702 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10703 switch(type) { 10704 case T_BOOLEAN: 10705 case T_BYTE: 10706 evpermb(dst, mask, nds, src, merge, vector_len); break; 10707 case T_CHAR: 10708 case T_SHORT: 10709 evpermw(dst, mask, nds, src, merge, vector_len); break; 10710 case T_INT: 10711 case T_FLOAT: 10712 evpermd(dst, mask, nds, src, merge, vector_len); break; 10713 case T_LONG: 10714 case T_DOUBLE: 10715 evpermq(dst, mask, nds, src, merge, vector_len); break; 10716 default: 10717 fatal("Unexpected type argument %s", type2name(type)); break; 10718 } 10719 } 10720 10721 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10722 switch(type) { 10723 case T_BYTE: 10724 evpminub(dst, mask, nds, src, merge, vector_len); break; 10725 case T_SHORT: 10726 evpminuw(dst, mask, nds, src, merge, vector_len); break; 10727 case T_INT: 10728 evpminud(dst, mask, nds, src, merge, vector_len); break; 10729 case T_LONG: 10730 evpminuq(dst, mask, nds, src, merge, vector_len); break; 10731 default: 10732 fatal("Unexpected type argument %s", type2name(type)); break; 10733 } 10734 } 10735 10736 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10737 switch(type) { 10738 case T_BYTE: 10739 evpmaxub(dst, mask, nds, src, merge, vector_len); break; 10740 case T_SHORT: 10741 evpmaxuw(dst, mask, nds, src, merge, vector_len); break; 10742 case T_INT: 10743 evpmaxud(dst, mask, nds, src, merge, vector_len); break; 10744 case T_LONG: 10745 evpmaxuq(dst, mask, nds, src, merge, vector_len); break; 10746 default: 10747 fatal("Unexpected type argument %s", type2name(type)); break; 10748 } 10749 } 10750 10751 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10752 switch(type) { 10753 case T_BYTE: 10754 evpminub(dst, mask, nds, src, merge, vector_len); break; 10755 case T_SHORT: 10756 evpminuw(dst, mask, nds, src, merge, vector_len); break; 10757 case T_INT: 10758 evpminud(dst, mask, nds, src, merge, vector_len); break; 10759 case T_LONG: 10760 evpminuq(dst, mask, nds, src, merge, vector_len); break; 10761 default: 10762 fatal("Unexpected type argument %s", type2name(type)); break; 10763 } 10764 } 10765 10766 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10767 switch(type) { 10768 case T_BYTE: 10769 evpmaxub(dst, mask, nds, src, merge, vector_len); break; 10770 case T_SHORT: 10771 evpmaxuw(dst, mask, nds, src, merge, vector_len); break; 10772 case T_INT: 10773 evpmaxud(dst, mask, nds, src, merge, vector_len); break; 10774 case T_LONG: 10775 evpmaxuq(dst, mask, nds, src, merge, vector_len); break; 10776 default: 10777 fatal("Unexpected type argument %s", type2name(type)); break; 10778 } 10779 } 10780 10781 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10782 switch(type) { 10783 case T_BYTE: 10784 evpminsb(dst, mask, nds, src, merge, vector_len); break; 10785 case T_SHORT: 10786 evpminsw(dst, mask, nds, src, merge, vector_len); break; 10787 case T_INT: 10788 
evpminsd(dst, mask, nds, src, merge, vector_len); break; 10789 case T_LONG: 10790 evpminsq(dst, mask, nds, src, merge, vector_len); break; 10791 default: 10792 fatal("Unexpected type argument %s", type2name(type)); break; 10793 } 10794 } 10795 10796 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10797 switch(type) { 10798 case T_BYTE: 10799 evpmaxsb(dst, mask, nds, src, merge, vector_len); break; 10800 case T_SHORT: 10801 evpmaxsw(dst, mask, nds, src, merge, vector_len); break; 10802 case T_INT: 10803 evpmaxsd(dst, mask, nds, src, merge, vector_len); break; 10804 case T_LONG: 10805 evpmaxsq(dst, mask, nds, src, merge, vector_len); break; 10806 default: 10807 fatal("Unexpected type argument %s", type2name(type)); break; 10808 } 10809 } 10810 10811 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10812 switch(type) { 10813 case T_BYTE: 10814 evpminsb(dst, mask, nds, src, merge, vector_len); break; 10815 case T_SHORT: 10816 evpminsw(dst, mask, nds, src, merge, vector_len); break; 10817 case T_INT: 10818 evpminsd(dst, mask, nds, src, merge, vector_len); break; 10819 case T_LONG: 10820 evpminsq(dst, mask, nds, src, merge, vector_len); break; 10821 default: 10822 fatal("Unexpected type argument %s", type2name(type)); break; 10823 } 10824 } 10825 10826 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10827 switch(type) { 10828 case T_BYTE: 10829 evpmaxsb(dst, mask, nds, src, merge, vector_len); break; 10830 case T_SHORT: 10831 evpmaxsw(dst, mask, nds, src, merge, vector_len); break; 10832 case T_INT: 10833 evpmaxsd(dst, mask, nds, src, merge, vector_len); break; 10834 case T_LONG: 10835 evpmaxsq(dst, mask, nds, src, merge, vector_len); break; 10836 default: 10837 fatal("Unexpected type argument %s", type2name(type)); break; 10838 } 10839 } 10840 10841 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10842 switch(type) { 10843 case T_INT: 10844 evpxord(dst, mask, nds, src, merge, vector_len); break; 10845 case T_LONG: 10846 evpxorq(dst, mask, nds, src, merge, vector_len); break; 10847 default: 10848 fatal("Unexpected type argument %s", type2name(type)); break; 10849 } 10850 } 10851 10852 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10853 switch(type) { 10854 case T_INT: 10855 evpxord(dst, mask, nds, src, merge, vector_len); break; 10856 case T_LONG: 10857 evpxorq(dst, mask, nds, src, merge, vector_len); break; 10858 default: 10859 fatal("Unexpected type argument %s", type2name(type)); break; 10860 } 10861 } 10862 10863 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10864 switch(type) { 10865 case T_INT: 10866 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break; 10867 case T_LONG: 10868 evporq(dst, mask, nds, src, merge, vector_len); break; 10869 default: 10870 fatal("Unexpected type argument %s", type2name(type)); break; 10871 } 10872 } 10873 10874 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10875 switch(type) { 10876 case T_INT: 10877 
Assembler::evpord(dst, mask, nds, src, merge, vector_len); break; 10878 case T_LONG: 10879 evporq(dst, mask, nds, src, merge, vector_len); break; 10880 default: 10881 fatal("Unexpected type argument %s", type2name(type)); break; 10882 } 10883 } 10884 10885 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10886 switch(type) { 10887 case T_INT: 10888 evpandd(dst, mask, nds, src, merge, vector_len); break; 10889 case T_LONG: 10890 evpandq(dst, mask, nds, src, merge, vector_len); break; 10891 default: 10892 fatal("Unexpected type argument %s", type2name(type)); break; 10893 } 10894 } 10895 10896 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10897 switch(type) { 10898 case T_INT: 10899 evpandd(dst, mask, nds, src, merge, vector_len); break; 10900 case T_LONG: 10901 evpandq(dst, mask, nds, src, merge, vector_len); break; 10902 default: 10903 fatal("Unexpected type argument %s", type2name(type)); break; 10904 } 10905 } 10906 10907 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) { 10908 switch(masklen) { 10909 case 8: 10910 kortestbl(src1, src2); 10911 break; 10912 case 16: 10913 kortestwl(src1, src2); 10914 break; 10915 case 32: 10916 kortestdl(src1, src2); 10917 break; 10918 case 64: 10919 kortestql(src1, src2); 10920 break; 10921 default: 10922 fatal("Unexpected mask length %d", masklen); 10923 break; 10924 } 10925 } 10926 10927 10928 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) { 10929 switch(masklen) { 10930 case 8: 10931 ktestbl(src1, src2); 10932 break; 10933 case 16: 10934 ktestwl(src1, src2); 10935 break; 10936 case 32: 10937 ktestdl(src1, src2); 10938 break; 10939 case 64: 10940 ktestql(src1, src2); 10941 break; 10942 default: 10943 fatal("Unexpected mask length %d", masklen); 10944 break; 10945 } 10946 } 10947 10948 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) { 10949 switch(type) { 10950 case T_INT: 10951 evprold(dst, mask, src, shift, merge, vlen_enc); break; 10952 case T_LONG: 10953 evprolq(dst, mask, src, shift, merge, vlen_enc); break; 10954 default: 10955 fatal("Unexpected type argument %s", type2name(type)); break; 10956 break; 10957 } 10958 } 10959 10960 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) { 10961 switch(type) { 10962 case T_INT: 10963 evprord(dst, mask, src, shift, merge, vlen_enc); break; 10964 case T_LONG: 10965 evprorq(dst, mask, src, shift, merge, vlen_enc); break; 10966 default: 10967 fatal("Unexpected type argument %s", type2name(type)); break; 10968 } 10969 } 10970 10971 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 10972 switch(type) { 10973 case T_INT: 10974 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break; 10975 case T_LONG: 10976 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break; 10977 default: 10978 fatal("Unexpected type argument %s", type2name(type)); break; 10979 } 10980 } 10981 10982 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 10983 switch(type) { 10984 case T_INT: 10985 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break; 10986 case T_LONG: 10987 
evprorvq(dst, mask, src1, src2, merge, vlen_enc); break; 10988 default: 10989 fatal("Unexpected type argument %s", type2name(type)); break; 10990 } 10991 } 10992 10993 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 10994 assert(rscratch != noreg || always_reachable(src), "missing"); 10995 10996 if (reachable(src)) { 10997 evpandq(dst, nds, as_Address(src), vector_len); 10998 } else { 10999 lea(rscratch, src); 11000 evpandq(dst, nds, Address(rscratch, 0), vector_len); 11001 } 11002 } 11003 11004 void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 11005 assert(rscratch != noreg || always_reachable(src), "missing"); 11006 11007 if (reachable(src)) { 11008 Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len); 11009 } else { 11010 lea(rscratch, src); 11011 Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len); 11012 } 11013 } 11014 11015 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 11016 assert(rscratch != noreg || always_reachable(src), "missing"); 11017 11018 if (reachable(src)) { 11019 evporq(dst, nds, as_Address(src), vector_len); 11020 } else { 11021 lea(rscratch, src); 11022 evporq(dst, nds, Address(rscratch, 0), vector_len); 11023 } 11024 } 11025 11026 void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 11027 assert(rscratch != noreg || always_reachable(src), "missing"); 11028 11029 if (reachable(src)) { 11030 vpshufb(dst, nds, as_Address(src), vector_len); 11031 } else { 11032 lea(rscratch, src); 11033 vpshufb(dst, nds, Address(rscratch, 0), vector_len); 11034 } 11035 } 11036 11037 void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 11038 assert(rscratch != noreg || always_reachable(src), "missing"); 11039 11040 if (reachable(src)) { 11041 Assembler::vpor(dst, nds, as_Address(src), vector_len); 11042 } else { 11043 lea(rscratch, src); 11044 Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len); 11045 } 11046 } 11047 11048 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) { 11049 assert(rscratch != noreg || always_reachable(src3), "missing"); 11050 11051 if (reachable(src3)) { 11052 vpternlogq(dst, imm8, src2, as_Address(src3), vector_len); 11053 } else { 11054 lea(rscratch, src3); 11055 vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len); 11056 } 11057 } 11058 11059 #if COMPILER2_OR_JVMCI 11060 11061 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask, 11062 Register length, Register temp, int vec_enc) { 11063 // Computing mask for predicated vector store. 11064 movptr(temp, -1); 11065 bzhiq(temp, temp, length); 11066 kmov(mask, temp); 11067 evmovdqu(bt, mask, dst, xmm, true, vec_enc); 11068 } 11069 11070 // Set memory operation for length "less than" 64 bytes. 
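// Without a 64-byte vector the range is covered by a full 32-byte fill plus a masked fill of
// the remainder; with use64byteVector a single masked 64-byte store handles the whole range.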
11071 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp, 11072 XMMRegister xmm, KRegister mask, Register length, 11073 Register temp, bool use64byteVector) { 11074 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 11075 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 11076 if (!use64byteVector) { 11077 fill32(dst, disp, xmm); 11078 subptr(length, 32 >> shift); 11079 fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp); 11080 } else { 11081 assert(MaxVectorSize == 64, "vector length != 64"); 11082 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit); 11083 } 11084 } 11085 11086 11087 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp, 11088 XMMRegister xmm, KRegister mask, Register length, 11089 Register temp) { 11090 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 11091 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 11092 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit); 11093 } 11094 11095 11096 void MacroAssembler::fill32(Address dst, XMMRegister xmm) { 11097 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 11098 vmovdqu(dst, xmm); 11099 } 11100 11101 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) { 11102 fill32(Address(dst, disp), xmm); 11103 } 11104 11105 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) { 11106 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 11107 if (!use64byteVector) { 11108 fill32(dst, xmm); 11109 fill32(dst.plus_disp(32), xmm); 11110 } else { 11111 evmovdquq(dst, xmm, Assembler::AVX_512bit); 11112 } 11113 } 11114 11115 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) { 11116 fill64(Address(dst, disp), xmm, use64byteVector); 11117 } 11118 11119 #ifdef _LP64 11120 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value, 11121 Register count, Register rtmp, XMMRegister xtmp) { 11122 Label L_exit; 11123 Label L_fill_start; 11124 Label L_fill_64_bytes; 11125 Label L_fill_96_bytes; 11126 Label L_fill_128_bytes; 11127 Label L_fill_128_bytes_loop; 11128 Label L_fill_128_loop_header; 11129 Label L_fill_128_bytes_loop_header; 11130 Label L_fill_128_bytes_loop_pre_header; 11131 Label L_fill_zmm_sequence; 11132 11133 int shift = -1; 11134 int avx3threshold = VM_Version::avx3_threshold(); 11135 switch(type) { 11136 case T_BYTE: shift = 0; 11137 break; 11138 case T_SHORT: shift = 1; 11139 break; 11140 case T_INT: shift = 2; 11141 break; 11142 /* Uncomment when LONG fill stubs are supported. 
11143 case T_LONG: shift = 3; 11144 break; 11145 */ 11146 default: 11147 fatal("Unhandled type: %s\n", type2name(type)); 11148 } 11149 11150 if ((avx3threshold != 0) || (MaxVectorSize == 32)) { 11151 11152 if (MaxVectorSize == 64) { 11153 cmpq(count, avx3threshold >> shift); 11154 jcc(Assembler::greater, L_fill_zmm_sequence); 11155 } 11156 11157 evpbroadcast(type, xtmp, value, Assembler::AVX_256bit); 11158 11159 bind(L_fill_start); 11160 11161 cmpq(count, 32 >> shift); 11162 jccb(Assembler::greater, L_fill_64_bytes); 11163 fill32_masked(shift, to, 0, xtmp, k2, count, rtmp); 11164 jmp(L_exit); 11165 11166 bind(L_fill_64_bytes); 11167 cmpq(count, 64 >> shift); 11168 jccb(Assembler::greater, L_fill_96_bytes); 11169 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp); 11170 jmp(L_exit); 11171 11172 bind(L_fill_96_bytes); 11173 cmpq(count, 96 >> shift); 11174 jccb(Assembler::greater, L_fill_128_bytes); 11175 fill64(to, 0, xtmp); 11176 subq(count, 64 >> shift); 11177 fill32_masked(shift, to, 64, xtmp, k2, count, rtmp); 11178 jmp(L_exit); 11179 11180 bind(L_fill_128_bytes); 11181 cmpq(count, 128 >> shift); 11182 jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header); 11183 fill64(to, 0, xtmp); 11184 fill32(to, 64, xtmp); 11185 subq(count, 96 >> shift); 11186 fill32_masked(shift, to, 96, xtmp, k2, count, rtmp); 11187 jmp(L_exit); 11188 11189 bind(L_fill_128_bytes_loop_pre_header); 11190 { 11191 mov(rtmp, to); 11192 andq(rtmp, 31); 11193 jccb(Assembler::zero, L_fill_128_bytes_loop_header); 11194 negq(rtmp); 11195 addq(rtmp, 32); 11196 mov64(r8, -1L); 11197 bzhiq(r8, r8, rtmp); 11198 kmovql(k2, r8); 11199 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit); 11200 addq(to, rtmp); 11201 shrq(rtmp, shift); 11202 subq(count, rtmp); 11203 } 11204 11205 cmpq(count, 128 >> shift); 11206 jcc(Assembler::less, L_fill_start); 11207 11208 bind(L_fill_128_bytes_loop_header); 11209 subq(count, 128 >> shift); 11210 11211 align32(); 11212 bind(L_fill_128_bytes_loop); 11213 fill64(to, 0, xtmp); 11214 fill64(to, 64, xtmp); 11215 addq(to, 128); 11216 subq(count, 128 >> shift); 11217 jccb(Assembler::greaterEqual, L_fill_128_bytes_loop); 11218 11219 addq(count, 128 >> shift); 11220 jcc(Assembler::zero, L_exit); 11221 jmp(L_fill_start); 11222 } 11223 11224 if (MaxVectorSize == 64) { 11225 // Sequence using 64 byte ZMM register. 
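    // Mirrors the 32-byte sequence above at 64-byte granularity: masked fills cover counts up
    // to 192 bytes, a prologue aligns 'to' to a 64-byte boundary, and the main loop then
    // fills 192 bytes per iteration.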
11226 Label L_fill_128_bytes_zmm; 11227 Label L_fill_192_bytes_zmm; 11228 Label L_fill_192_bytes_loop_zmm; 11229 Label L_fill_192_bytes_loop_header_zmm; 11230 Label L_fill_192_bytes_loop_pre_header_zmm; 11231 Label L_fill_start_zmm_sequence; 11232 11233 bind(L_fill_zmm_sequence); 11234 evpbroadcast(type, xtmp, value, Assembler::AVX_512bit); 11235 11236 bind(L_fill_start_zmm_sequence); 11237 cmpq(count, 64 >> shift); 11238 jccb(Assembler::greater, L_fill_128_bytes_zmm); 11239 fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true); 11240 jmp(L_exit); 11241 11242 bind(L_fill_128_bytes_zmm); 11243 cmpq(count, 128 >> shift); 11244 jccb(Assembler::greater, L_fill_192_bytes_zmm); 11245 fill64(to, 0, xtmp, true); 11246 subq(count, 64 >> shift); 11247 fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true); 11248 jmp(L_exit); 11249 11250 bind(L_fill_192_bytes_zmm); 11251 cmpq(count, 192 >> shift); 11252 jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm); 11253 fill64(to, 0, xtmp, true); 11254 fill64(to, 64, xtmp, true); 11255 subq(count, 128 >> shift); 11256 fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true); 11257 jmp(L_exit); 11258 11259 bind(L_fill_192_bytes_loop_pre_header_zmm); 11260 { 11261 movq(rtmp, to); 11262 andq(rtmp, 63); 11263 jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm); 11264 negq(rtmp); 11265 addq(rtmp, 64); 11266 mov64(r8, -1L); 11267 bzhiq(r8, r8, rtmp); 11268 kmovql(k2, r8); 11269 evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit); 11270 addq(to, rtmp); 11271 shrq(rtmp, shift); 11272 subq(count, rtmp); 11273 } 11274 11275 cmpq(count, 192 >> shift); 11276 jcc(Assembler::less, L_fill_start_zmm_sequence); 11277 11278 bind(L_fill_192_bytes_loop_header_zmm); 11279 subq(count, 192 >> shift); 11280 11281 align32(); 11282 bind(L_fill_192_bytes_loop_zmm); 11283 fill64(to, 0, xtmp, true); 11284 fill64(to, 64, xtmp, true); 11285 fill64(to, 128, xtmp, true); 11286 addq(to, 192); 11287 subq(count, 192 >> shift); 11288 jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm); 11289 11290 addq(count, 192 >> shift); 11291 jcc(Assembler::zero, L_exit); 11292 jmp(L_fill_start_zmm_sequence); 11293 } 11294 bind(L_exit); 11295 } 11296 #endif 11297 #endif //COMPILER2_OR_JVMCI 11298 11299 11300 #ifdef _LP64 11301 void MacroAssembler::convert_f2i(Register dst, XMMRegister src) { 11302 Label done; 11303 cvttss2sil(dst, src); 11304 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub 11305 cmpl(dst, 0x80000000); // float_sign_flip 11306 jccb(Assembler::notEqual, done); 11307 subptr(rsp, 8); 11308 movflt(Address(rsp, 0), src); 11309 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup()))); 11310 pop(dst); 11311 bind(done); 11312 } 11313 11314 void MacroAssembler::convert_d2i(Register dst, XMMRegister src) { 11315 Label done; 11316 cvttsd2sil(dst, src); 11317 // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub 11318 cmpl(dst, 0x80000000); // float_sign_flip 11319 jccb(Assembler::notEqual, done); 11320 subptr(rsp, 8); 11321 movdbl(Address(rsp, 0), src); 11322 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup()))); 11323 pop(dst); 11324 bind(done); 11325 } 11326 11327 void MacroAssembler::convert_f2l(Register dst, XMMRegister src) { 11328 Label done; 11329 cvttss2siq(dst, src); 11330 cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip())); 11331 jccb(Assembler::notEqual, done); 11332 subptr(rsp, 8); 11333 
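  // Spill the original float argument; the f2l_fixup stub re-examines it to handle the NaN
  // and out-of-range cases where cvttss2siq alone does not match the JLS result.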
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // The following code is a line-by-line assembly translation of the rounding algorithm.
  // Please refer to the java.lang.Math.round(float) algorithm for details.
  const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
  const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
  const int32_t FloatConsts_EXP_BIAS = 127;
  const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
  const int32_t MINUS_32 = 0xFFFFFFE0;
  Label L_special_case, L_block1, L_exit;
  movl(rtmp, FloatConsts_EXP_BIT_MASK);
  movdl(dst, src);
  andl(dst, rtmp);
  sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
  movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
  subl(rtmp, dst);
  movl(rcx, rtmp);
  movl(dst, MINUS_32);
  testl(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  movdl(dst, src);
  andl(dst, FloatConsts_SIGNIF_BIT_MASK);
  orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
  movdl(rtmp, src);
  testl(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negl(dst);
  bind(L_block1);
  sarl(dst);
  addl(dst, 0x1);
  sarl(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_f2i(dst, src);
  bind(L_exit);
}

void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // The following code is a line-by-line assembly translation of the rounding algorithm.
  // Please refer to the java.lang.Math.round(double) algorithm for details.
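  // In outline: derive a shift count from the biased exponent; if it does not fit in the low
  // six bits (exponent outside the range this path handles, including infinities and NaN),
  // fall back to convert_d2l. Otherwise shift the signed significand so the 0.5 bit becomes
  // the least significant bit, add one, and shift once more, i.e. round half up.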
  const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
  const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
  const int64_t DoubleConsts_EXP_BIAS = 1023;
  const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
  const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
  Label L_special_case, L_block1, L_exit;
  mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
  movq(dst, src);
  andq(dst, rtmp);
  sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
  mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
  subq(rtmp, dst);
  movq(rcx, rtmp);
  mov64(dst, MINUS_64);
  testq(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  movq(dst, src);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
  andq(dst, rtmp);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
  orq(dst, rtmp);
  movq(rtmp, src);
  testq(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negq(dst);
  bind(L_block1);
  sarq(dst);
  addq(dst, 0x1);
  sarq(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_d2l(dst, src);
  bind(L_exit);
}

void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  cvttsd2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::cache_wb(Address line)
{
  // 64-bit CPUs always support clflush
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // prefer clwb (writeback without evict), otherwise
  // prefer clflushopt (potentially parallel writeback with evict),
  // otherwise fall back on clflush (serial writeback with evict)

  if (optimized) {
    if (no_evict) {
      clwb(line);
    } else {
      clflushopt(line);
    }
  } else {
    // no need for fence when using CLFLUSH
    clflush(line);
  }
}

void MacroAssembler::cache_wbsync(bool is_pre)
{
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // pick the correct implementation

  if (!is_pre && (optimized || no_evict)) {
    // need an sfence for post flush when using clflushopt or clwb,
    // otherwise no need for any synchronization

    sfence();
  }
}

#endif // _LP64

Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note some conditions are synonyms for others
    case Assembler::zero: return Assembler::notZero;
    case Assembler::notZero: return Assembler::zero;
    case Assembler::less: return Assembler::greaterEqual;
    case Assembler::lessEqual: return Assembler::greater;
    case Assembler::greater: return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below: return Assembler::aboveEqual;
    case Assembler::belowEqual: return Assembler::above;
    case Assembler::above: return
Assembler::belowEqual; 11478 case Assembler::aboveEqual: return Assembler::below; 11479 case Assembler::overflow: return Assembler::noOverflow; 11480 case Assembler::noOverflow: return Assembler::overflow; 11481 case Assembler::negative: return Assembler::positive; 11482 case Assembler::positive: return Assembler::negative; 11483 case Assembler::parity: return Assembler::noParity; 11484 case Assembler::noParity: return Assembler::parity; 11485 } 11486 ShouldNotReachHere(); return Assembler::overflow; 11487 } 11488 11489 // This is simply a call to Thread::current() 11490 void MacroAssembler::get_thread(Register thread) { 11491 if (thread != rax) { 11492 push(rax); 11493 } 11494 LP64_ONLY(push(rdi);) 11495 LP64_ONLY(push(rsi);) 11496 push(rdx); 11497 push(rcx); 11498 #ifdef _LP64 11499 push(r8); 11500 push(r9); 11501 push(r10); 11502 push(r11); 11503 #endif 11504 11505 MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0); 11506 11507 #ifdef _LP64 11508 pop(r11); 11509 pop(r10); 11510 pop(r9); 11511 pop(r8); 11512 #endif 11513 pop(rcx); 11514 pop(rdx); 11515 LP64_ONLY(pop(rsi);) 11516 LP64_ONLY(pop(rdi);) 11517 if (thread != rax) { 11518 mov(thread, rax); 11519 pop(rax); 11520 } 11521 } 11522 11523 void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) { 11524 Label L_stack_ok; 11525 if (bias == 0) { 11526 testptr(sp, 2 * wordSize - 1); 11527 } else { 11528 // lea(tmp, Address(rsp, bias); 11529 mov(tmp, sp); 11530 addptr(tmp, bias); 11531 testptr(tmp, 2 * wordSize - 1); 11532 } 11533 jcc(Assembler::equal, L_stack_ok); 11534 block_comment(msg); 11535 stop(msg); 11536 bind(L_stack_ok); 11537 } 11538 11539 // Implements lightweight-locking. 11540 // 11541 // obj: the object to be locked 11542 // reg_rax: rax 11543 // thread: the thread which attempts to lock obj 11544 // tmp: a temporary register 11545 void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) { 11546 assert(reg_rax == rax, ""); 11547 assert_different_registers(basic_lock, obj, reg_rax, thread, tmp); 11548 11549 Label push; 11550 const Register top = tmp; 11551 11552 // Preload the markWord. It is important that this is the first 11553 // instruction emitted as it is part of C1's null check semantics. 11554 movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes())); 11555 11556 if (UseObjectMonitorTable) { 11557 // Clear cache in case fast locking succeeds. 11558 movptr(Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))), 0); 11559 } 11560 11561 // Load top. 11562 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 11563 11564 // Check if the lock-stack is full. 11565 cmpl(top, LockStack::end_offset()); 11566 jcc(Assembler::greaterEqual, slow); 11567 11568 // Check for recursion. 11569 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 11570 jcc(Assembler::equal, push); 11571 11572 // Check header for monitor (0b10). 11573 testptr(reg_rax, markWord::monitor_value); 11574 jcc(Assembler::notZero, slow); 11575 11576 // Try to lock. 
Transition lock bits 0b01 => 0b00 11577 movptr(tmp, reg_rax); 11578 andptr(tmp, ~(int32_t)markWord::unlocked_value); 11579 orptr(reg_rax, markWord::unlocked_value); 11580 if (EnableValhalla) { 11581 // Mask inline_type bit such that we go to the slow path if object is an inline type 11582 andptr(reg_rax, ~((int) markWord::inline_type_bit_in_place)); 11583 } 11584 lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes())); 11585 jcc(Assembler::notEqual, slow); 11586 11587 // Restore top, CAS clobbers register. 11588 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 11589 11590 bind(push); 11591 // After successful lock, push object on lock-stack. 11592 movptr(Address(thread, top), obj); 11593 incrementl(top, oopSize); 11594 movl(Address(thread, JavaThread::lock_stack_top_offset()), top); 11595 } 11596 11597 // Implements lightweight-unlocking. 11598 // 11599 // obj: the object to be unlocked 11600 // reg_rax: rax 11601 // thread: the thread 11602 // tmp: a temporary register 11603 void MacroAssembler::lightweight_unlock(Register obj, Register reg_rax, Register thread, Register tmp, Label& slow) { 11604 assert(reg_rax == rax, ""); 11605 assert_different_registers(obj, reg_rax, thread, tmp); 11606 11607 Label unlocked, push_and_slow; 11608 const Register top = tmp; 11609 11610 // Check if obj is top of lock-stack. 11611 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 11612 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 11613 jcc(Assembler::notEqual, slow); 11614 11615 // Pop lock-stack. 11616 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 11617 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 11618 11619 // Check if recursive. 11620 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 11621 jcc(Assembler::equal, unlocked); 11622 11623 // Not recursive. Check header for monitor (0b10). 11624 movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes())); 11625 testptr(reg_rax, markWord::monitor_value); 11626 jcc(Assembler::notZero, push_and_slow); 11627 11628 #ifdef ASSERT 11629 // Check header not unlocked (0b01). 11630 Label not_unlocked; 11631 testptr(reg_rax, markWord::unlocked_value); 11632 jcc(Assembler::zero, not_unlocked); 11633 stop("lightweight_unlock already unlocked"); 11634 bind(not_unlocked); 11635 #endif 11636 11637 // Try to unlock. Transition lock bits 0b00 => 0b01 11638 movptr(tmp, reg_rax); 11639 orptr(tmp, markWord::unlocked_value); 11640 lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes())); 11641 jcc(Assembler::equal, unlocked); 11642 11643 bind(push_and_slow); 11644 // Restore lock-stack and handle the unlock in runtime. 11645 #ifdef ASSERT 11646 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 11647 movptr(Address(thread, top), obj); 11648 #endif 11649 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 11650 jmp(slow); 11651 11652 bind(unlocked); 11653 } 11654 11655 #ifdef _LP64 11656 // Saves legacy GPRs state on stack. 
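// Sixteen slots are reserved; slot 11, where rsp itself would sit in a pusha-style layout,
// is deliberately left unwritten, and restore_legacy_gprs skips it in the same way.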
void MacroAssembler::save_legacy_gprs() {
  subq(rsp, 16 * wordSize);
  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}

// Restores legacy GPR state from the stack.
void MacroAssembler::restore_legacy_gprs() {
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9, Address(rsp, 6 * wordSize));
  movq(r8, Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));
  addq(rsp, 16 * wordSize);
}

void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
  if (VM_Version::supports_apx_f()) {
    esetzucc(comparison, dst);
  } else {
    setb(comparison, dst);
    movzbl(dst, dst);
  }
}
#endif