ART虚拟机中的栈上替换编译(OSR)

站长

2024年06月05日 18:59 · 阅读数 73

Art的Jit编译分为3个档位：kBaseline、kOptimized、kOsr。解释执行的方法的hotness_count溢出时，就会触发kBaseline编译。kBaseline编译的代码只固化了一些偏移地址以及将变量分配在寄存器上，基本上没做什么优化，因此kBaseline编译的代码执行到hotness_count溢出时，就要触发kOptimized编译，进一步优化代码。如果编译的机器码一直得不到执行，比如方法一直在执行循环，这时就需要在方法返回之前，切换到机器码执行，以提高性能。[图片：ART虚拟机中的栈上替换编译(OSR)] 但是kBaseline或者kOptimized编译出来的机器码，只能从方法的入口处开始执行，无法从机器码的中间开始执行(无法恢复中间指令处寄存器和堆栈等上下文环境)，这时就需要使用Osr编译构造一个可以从中间指令处开始执行的优化代码。栈上替换的意思就是，构造机器码的执行栈，替换掉解释器的执行栈，并跳转到对应的机器码入口处。[图片：ART虚拟机中的栈上替换编译(OSR)] Osr会把循环的回边作为方法的执行入口。我们以nterp解释器为例，C解释器流程大致相同：

// BRANCH: tail of an interpreter branch handler. Adds the branch offset held
// in wINST to xPC and dispatches to the instruction at the new xPC. When the
// offset is negative or zero, it first goes through the hotness-counter update
// and suspend check at label 2.
.macro BRANCH // branch instruction
    add     xPC, xPC, wINST, sxtw #1    // update xPC
    // Update method counter and do a suspend check if the branch is negative or zero.
    cmp wINST, #0
    b.le 2f
1:
    // Dispatch path: fetch and jump to the instruction at the updated xPC.
    FETCH wINST, 0                      // load wINST
    GET_INST_OPCODE ip                  // extract opcode from wINST
    GOTO_OPCODE ip                      // jump to next instruction
2:
    // Load the pointer stored at [sp] (presumably the current ArtMethod*, given
    // the ART_METHOD_* offset used below) and read its 16-bit hotness counter.
    ldr x0, [sp]
    ldrh w2, [x0, #ART_METHOD_HOTNESS_COUNT_OFFSET]
#if (NTERP_HOTNESS_VALUE != 0)
#error Expected 0 for hotness value
#endif
    // If the counter is at zero, handle this in the runtime.
    cbz w2, NterpHandleHotnessOverflow // loop back edge
    // Otherwise decrement the counter and store it back.
    add x2, x2, #-1
    strh w2, [x0, #ART_METHOD_HOTNESS_COUNT_OFFSET]
    DO_SUSPEND_CHECK continue_label=1b
    b 1b
.endm

NterpHandleHotnessOverflow:
    CHECK_AND_UPDATE_SHARED_MEMORY_METHOD if_hot=1f, if_not_hot=5f
1:
    mov x1, xPC
    mov x2, xFP
    bl nterp_hot_method // x1寄存器，也就是参数1为dexPC
    cbnz x0, 3f
    ...

方法变热时，调用nterp_hot_method，nterp_hot_method会根据当前方法的执行热度，选择不同的编译策略

// Hot-method hook for nterp (excerpt; `...` marks code elided by the article).
//
// method:     the method being interpreted.
// dex_pc_ptr: pointer to the current dex instruction, or nullptr on method
//             entry where OSR is not wanted.
// vregs:      the interpreter's dex register values, forwarded to PrepareForOsr.
//
// Returns the OsrData produced by Jit::PrepareForOsr when it is non-null;
// otherwise asks the JIT to consider compiling the method and returns nullptr.
// Also returns nullptr when there is no JIT or JIT compilation is not in use.
extern "C" jit::OsrData* NterpHotMethod(ArtMethod* method, uint16_t* dex_pc_ptr, uint32_t* vregs)
    REQUIRES_SHARED(Locks::mutator_lock_) {
...
  jit::Jit* jit = runtime->GetJit();
  if (jit != nullptr && jit->UseJitCompilation()) {
    // Nterp passes null on entry where we don't want to OSR.
    if (dex_pc_ptr != nullptr) { // a non-null dex PC pointer means this is an OSR attempt
      // This could be a loop back edge, check if we can OSR.
      CodeItemInstructionAccessor accessor(method->DexInstructions());
      // Convert the instruction pointer into a dex pc (offset in 16-bit code units).
      uint32_t dex_pc = dex_pc_ptr - accessor.Insns();
      jit::OsrData* osr_data = jit->PrepareForOsr(
          method->GetInterfaceMethodIfProxy(kRuntimePointerSize), dex_pc, vregs);  // the machine-code frame is built here
      if (osr_data != nullptr) { // OSR code already exists and its frame was built successfully
        return osr_data;
      }
    }
    jit->MaybeEnqueueCompilation(method, Thread::Current()); // request compilation
  }
  return nullptr;
}

// Picks which kind of JIT compile task to queue for `method` on behalf of
// thread `self` (excerpt; `...` marks code elided by the article).
//
// - If the method's quick entry point already lies inside the JIT code cache,
//   an OSR task is queued for non-native methods that have no OSR code yet,
//   and the function returns.
// - Otherwise a baseline task is queued for non-native methods when the code
//   cache can allocate profiling info; in all other cases an optimized task
//   is queued.
void Jit::MaybeEnqueueCompilation(ArtMethod* method, Thread* self) {
...
  if (GetCodeCache()->ContainsPc(method->GetEntryPointFromQuickCompiledCode())) {
    if (!method->IsNative() && !code_cache_->IsOsrCompiled(method)) {
      // If we already have compiled code for it, nterp may be stuck in a loop.
      // Compile OSR.
      AddCompileTask(self, method, CompilationKind::kOsr);  // compiled code exists but never gets to run, so the method is very hot: compile OSR
    }
    return;
  }
...
  if (!method->IsNative() && GetCodeCache()->CanAllocateProfilingInfo()) {
    AddCompileTask(self, method, CompilationKind::kBaseline); // baseline compilation
  } else {
    AddCompileTask(self, method, CompilationKind::kOptimized); // optimized compilation
  }
}

接下来看下机器码执行栈是如何构造的，大概逻辑就是把OsrEntry处的所有存活的局部变量以及callee-save寄存器，从解释器执行栈上恢复到机器码执行栈上。至于为啥物理寄存器上没有局部变量，后面再做解释。如下图所示

[图片：ART虚拟机中的栈上替换编译(OSR)] 解释器执行栈上虚拟寄存器的布局是固定的，现在需要的是机器码执行栈上寄存器的布局。这个信息记录在Osr编译生成的stackmap的vreg_map中，具体的代码主要在PrepareForOsr方法中实现，如下：

OsrData* Jit::PrepareForOsr(ArtMethod* method, uint32_t dex_pc, uint32_t* vregs) {
···
    memset(osr_data, 0, sizeof(OsrData) + frame_size);
    osr_data->frame_size = frame_size;

    // Art ABI: ArtMethod is at the bottom of the stack.
    osr_data->memory[0] = method; //恢复ArtMehod*
     // Find stack map starting at the target dex_pc.
    StackMap stack_map = code_info.GetOsrStackMapForDexPc(dex_pc); //找到当前dex_pc处的stackmap
    if (!stack_map.IsValid()) {
      // There is no OSR stack map for this dex pc offset. Just return to the interpreter in the
      // hope that the next branch has one.
      return nullptr;
    }

    // We found a stack map, now fill the frame with dex register values from the interpreter's
    // shadow frame.
    DexRegisterMap vreg_map = code_info.GetDexRegisterMapOf(stack_map);//获取寄存器的布局图
   for (uint16_t vreg = 0; vreg < number_of_vregs; ++vreg) { // 恢复虚拟寄存器
    DexRegisterLocation::Kind location = vreg_map[vreg].GetKind();
    if (location == DexRegisterLocation::Kind::kNone) {
      // Dex register is dead or uninitialized.
      continue;
    }

    if (location == DexRegisterLocation::Kind::kConstant) {
      // We skip constants because the compiled code knows how to handle them.
      continue;
    }

    DCHECK_EQ(location, DexRegisterLocation::Kind::kInStack);

    int32_t vreg_value = vregs[vreg];
    int32_t slot_offset = vreg_map[vreg].GetStackOffsetInBytes();
    DCHECK_LT(slot_offset, static_cast<int32_t>(frame_size));
    DCHECK_GT(slot_offset, 0);
    (reinterpret_cast<int32_t*>(osr_data->memory))[slot_offset / sizeof(int32_t)] = vreg_value;
  }
  osr_data->native_pc = stack_map.GetNativePcOffset(kRuntimeISA) +
        osr_method->GetEntryPoint(); //跳转目标地址

记录stackmap vreg_map的逻辑如下

void CodeGenerator::RecordPcInfo(HInstruction* instruction,
                                 uint32_t dex_pc,
                                 uint32_t native_pc,
                                 SlowPathCode* slow_path,
                                 bool native_debug_info) {
 ...
   bool osr =
      instruction->IsSuspendCheck() &&
      (info != nullptr) &&
      graph_->IsCompilingOsr() &&
      (inlining_depth == 0); //是否是osr编译且在循环回边的suspendcheck处
  StackMap::Kind kind = native_debug_info
      ? StackMap::Kind::Debug
      : (osr ? StackMap::Kind::OSR : StackMap::Kind::Default);
  bool needs_vreg_info = NeedsVregInfo(instruction, osr);
  stack_map_stream->BeginStackMapEntry(outer_dex_pc,
                                       native_pc,
                                       register_mask,
                                       locations->GetStackMask(),
                                       kind,
                                       needs_vreg_info);

  EmitEnvironment(environment, slow_path, needs_vreg_info); //environment中包含存活局部变量信息
 ...

如下这段简单的java代码

/**
 * Minimal example whose loop back edge serves as the OSR entry point
 * discussed in the surrounding text.
 */
public class Osr {

    /**
     * Appends each argument to {@code o} until an empty string is seen.
     *
     * @param args command-line arguments iterated by the loop
     */
    public static void main(String[] args) {
        Object o = new Object();
        for (int i = 0; i < args.length; i++) {
            String s = args[i];
            if (s.isEmpty()) {
                return;
            }
            // String concatenation: the result of (o + s) is stored back into o.
            o += s;
        } // Loop back edge: the locals still live here are o, i and args.
    }
}

生成的stackmap如下

  1: void com.example.Osr.main(java.lang.String[]) (dex_method_idx=1)
  ...
        CodeInfo CodeSize:392 FrameSize:176 CoreSpillMask:7fe00000 FpSpillMask:ff00 NumberOfDexRegisters:5
          StackMap BitSize=345 Rows=15 Bits={Kind=2 PackedNativePc=7 DexPc=6 RegisterMaskIndex=3 StackMaskIndex=1 InlineInfoIndex=0 DexRegisterMaskIndex=1 DexRegisterMapIndex=3}
          RegisterMask BitSize=36 Rows=4 Bits={Value=4 Shift=5}
          StackMask BitSize=7 Rows=1 Bits={Mask=7}
          DexRegisterMask BitSize=5 Rows=1 Bits={Mask=5}
          DexRegisterMapInfo BitSize=6 Rows=3 Bits={CatalogueIndex=2}
          DexRegisterInfo BitSize=12 Rows=3 Bits={Kind=1 PackedValue=3}
    QuickMethodFrameInfo
      frame_size_in_bytes: 176
      core_spill_mask: 0x7fe00000 (r21, r22, r23, r24, r25, r26, r27, r28, r29, r30)
      fp_spill_mask: 0x0000ff00 (fr8, fr9, fr10, fr11, fr12, fr13, fr14, fr15)
      vr_stack_locations:
      	locals: v0[sp + #12] v1[sp + #16] v2[sp + #20] v3[sp + #24]
      	ins: v4[sp + #184]
      	method*: v5[sp + #0]
      	outs: v0[sp + #8] v1[sp + #12]
  ...
      0x00001184: b9001bf8	str w24, [sp, #24]
      0x00001188: f94002b5	ldr x21, [x21]
        StackMap[10] (native_pc=0x118c, dex_pc=0x6, register_mask=0x0, stack_mask=0b1010000, OSR)
          v0:sp+16 v1:sp+20 v4:sp+24 //vreg_map映射关系，v0代表局部变量o存储在sp+16处，v1代表i存储在sp+20，v4代表局部变量args存储在sp+24处

至于osr编译的局部变量为啥全都spill到栈上，从代码的注释看，是为了实现上的简化：

CodeGenerator::CodeGenerator(HGraph* graph,
                             size_t number_of_core_registers,
                             size_t number_of_fpu_registers,
                             size_t number_of_register_pairs,
 ...
   if (GetGraph()->IsCompilingOsr()) {
    // Make OSR methods have all registers spilled, this simplifies the logic of
    // jumping to the compiled code directly.
    for (size_t i = 0; i < number_of_core_registers_; ++i) { //spill所有的callee-save寄存器到栈上，和nterp执行栈保持一致
      if (IsCoreCalleeSaveRegister(i)) {
        AddAllocatedRegister(Location::RegisterLocation(i));
      }
    }
    for (size_t i = 0; i < number_of_fpu_registers_; ++i) {
      if (IsFloatingPointCalleeSaveRegister(i)) {
        AddAllocatedRegister(Location::FpuRegisterLocation(i));
      }
    }
  }

上面的代码保证了osr执行栈上的callee-save寄存器和nterp执行栈上的是一致的，因此这部分不需要拷贝到osr栈上。下面来看下osr入口处虚拟寄存器是如何分配到栈上的，分配寄存器的逻辑在方法AllocateRegistersInternal中，具体如下：

// Excerpt of the linear-scan register allocation driver (`···` / `...` mark
// code elided by the article). The part shown blocks every register for one
// lifetime position at the start of each catch block and of each irreducible
// loop header, so that values live into those blocks end up spilled.
void RegisterAllocatorLinearScan::AllocateRegistersInternal() {
···
    if (block->IsCatchBlock() ||
        (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) {  // at an OSR loop back edge the extra OSR entry makes the loop irreducible
      // By blocking all registers at the top of each catch block or irreducible loop, we force
      // intervals belonging to the live-in set of the catch/header block to be spilled.
      // TODO(ngeoffray): Phis in this block could be allocated in register.
      size_t position = block->GetLifetimeStart();
      BlockRegisters(position, position + 1); // forces every variable live at the OSR entry to be spilled to the stack
    }
  }
...
}

上面那段java代码循环回边处寄存器分配的结果如下，和stackmap记录的一致

[图片：ART虚拟机中的栈上替换编译(OSR)] 构造好Osr栈帧数据后，后面的逻辑就简单了：只需Pop当前nterp的执行栈，将构造好的Osr栈数据拷贝到执行栈上，然后跳转至对应的入口处。

NterpHandleHotnessOverflow:
    CHECK_AND_UPDATE_SHARED_MEMORY_METHOD if_hot=1f, if_not_hot=5f
1:
    mov x1, xPC
    mov x2, xFP
    bl nterp_hot_method
    cbnz x0, 3f
2:
...
3:
    // Drop the current frame. //pop当前解释器的执行栈
    ldr ip, [xREFS, #-8]
    mov sp, ip
    ...
    add x2, x0, #OSR_DATA_MEMORY  //x2寄存器保存了栈上的数据
4:
    sub x1, x1, #8 //x1为frame size 去掉 callee-save的大小
    ldr ip, [x2, x1]
    str ip, [sp, x1]
    cbnz x1, 4b    //拷贝到执行栈上    

    // Fetch the native PC to jump to and save it in a callee-save register.
    ldr xFP, [x0, #OSR_DATA_NATIVE_PC]

    // Free the memory holding OSR Data.
    bl free

    // Jump to the compiled code.
    br xFP  //跳转到目标地址上继续执行

转载自:https://juejin.cn/post/7371712684796706816