本节将要介绍指令选择中combine优化的概念, combine的目的是简化DAG, 合并/消除冗余节点, 为生成更优的指令做准备. 大部分combine是与架构无关的优化, 但LLVM也提供了修改combine的custom接口.
尽管本节介绍的是combine的流程, 但combine与legalize及lowering存在关联, 我们在介绍时也会涉及相关的概念.
使用上节提到的图形化方式来阅读DAG固然利于理解, 但是却不方便调试. 这里更推荐使用LLVM的日志系统打印文字版的DAG描述. 在编译时添加-mllvm -debug-only=isel即可打印SelectionDAGISel模块的调试信息, 以下截取部分.
Initial selection DAG: %bb.0 ‘test:entry‘
SelectionDAG has 29 nodes:
  t0: ch = EntryToken
  t2: i32,ch = CopyFromReg t0, Register:i32 %0
  t12: i32 = Constant<0>
  t14: i32,ch = load<(load 4 from %ir.p1, !tbaa !2)> t0, t2, undef:i32
      t6: i32,ch = CopyFromReg t0, Register:i32 %2
    t18: i32 = mul nsw t14, t6
            t8: i32,ch = CopyFromReg t0, Register:i32 %3
          t10: i32 = AssertZext t8, ValueType:ch:i8
        t11: i8 = truncate t10
      t16: i8 = and t11, Constant:i8<31>
    t17: i32 = zero_extend t16
  t19: i32 = shl t18, t17
  t24: i32 = GlobalAddress<i32 (i32)* @test2> 0
          t4: i32,ch = CopyFromReg t0, Register:i32 %1
        t21: i32 = shl t4, Constant:i32<2>
      t22: i32 = add t2, t21
    t23: ch = store<(store 4 into %ir.arrayidx1, !tbaa !2)> t14:1, t19, t22, undef:i32
  t26: ch,glue = CopyToReg t23, Register:i32 $x10, t19
  t28: ch,glue = RISCVISD::TAIL t26, TargetGlobalAddress:i32<i32 (i32)* @test2> 0 [TF=1], Register:i32 $x10, t26:1
Optimized lowered selection DAG: %bb.0 ‘test:entry‘
SelectionDAG has 25 nodes:
  t0: ch = EntryToken
  t2: i32,ch = CopyFromReg t0, Register:i32 %0
  t14: i32,ch = load<(load 4 from %ir.p1, !tbaa !2)> t0, t2, undef:i32
      t6: i32,ch = CopyFromReg t0, Register:i32 %2
    t18: i32 = mul nsw t14, t6
        t8: i32,ch = CopyFromReg t0, Register:i32 %3
      t10: i32 = AssertZext t8, ValueType:ch:i8
    t30: i32 = and t10, Constant:i32<31>
  t19: i32 = shl t18, t30
          t4: i32,ch = CopyFromReg t0, Register:i32 %1
        t21: i32 = shl t4, Constant:i32<2>
      t22: i32 = add t2, t21
    t23: ch = store<(store 4 into %ir.arrayidx1, !tbaa !2)> t14:1, t19, t22, undef:i32
  t26: ch,glue = CopyToReg t23, Register:i32 $x10, t19
  t28: ch,glue = RISCVISD::TAIL t26, TargetGlobalAddress:i32<i32 (i32)* @test2> 0 [TF=1], Register:i32 $x10, t26:1
SelectionDAG类实现了一个名为dump()(defined in lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp)的接口用于打印DAG图. 关于接口的具体实现不再赘述, 这里介绍下怎么阅读文字版的DAG.
SelectionDAGISel::CodeGenAndEmitDAG()中会交替调用SelectionDAG::Combine(), SelectionDAG::Legalize()等接口(即概述中的那张流程图). 多次Combine()的原因是每次legalize后DAG都可能发生变化, 所以需要尝试多次优化DAG. SelectionDAG::Combine()会创建一个DAGCombiner的类并调用DAGCombiner::Run(). 注意Combine会传入一个CombineLevel(defined in include/llvm/CodeGen/DAGCombine.h)的枚举表明调用Combine()接口时的时间点, 对应不同时间点DAGCombiner的优化也不同.
void DAGCombiner::Run(CombineLevel AtLevel) {
  ......
  // 将DAG中所有节点加入worklist
  for (SDNode &Node : DAG.allnodes())
    AddToWorklist(&Node);
  // 创建一个引用root的dummy节点来防止root节点被优化
  HandleSDNode Dummy(DAG.getRoot());
  while (SDNode *N = getNextWorklistEntry()) {
    // 如果一个节点没有user则删除该节点, 当一个节点被删除时递归检查它的operand是否也可以被删除
    if (recursivelyDeleteUnusedNodes(N))
      continue;
    WorklistRemover DeadNodes(*this);
    // 如果在legalize operation之后combine节点, 必须要保证combine后的节点也是legalize的
    if (Level == AfterLegalizeDAG) {
      SmallSetVector<SDNode *, 16> UpdatedNodes;
      bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
      for (SDNode *LN : UpdatedNodes) {
        AddUsersToWorklist(LN);
        AddToWorklist(LN);
      }
      if (!NIsValid)
        continue;
    }
    // 检查被combine的节点的operand是否需要被加入尝试合并/替换当前节点来化简DAG.worklist
    CombinedNodes.insert(N);
    for (const SDValue &ChildN : N->op_values())
      if (!CombinedNodes.count(ChildN.getNode()))
        AddToWorklist(ChildN.getNode());
    // combine入口
    SDValue RV = combine(N);
    // 返回SDNode指针为空说明combine()没有生成新的节点, 跳过替换步骤
    if (!RV.getNode())
      continue;
    // 返回的SDNode指针为原节点表明CombineTo()已经处理了该节点, 无需再次替换
    if (RV.getNode() == N)
      continue;
    // 检查新节点返回的值的个数, 并替换旧结点
    if (N->getNumValues() == RV.getNode()->getNumValues())
      DAG.ReplaceAllUsesWith(N, RV.getNode());
    else {
      assert(N->getValueType(0) == RV.getValueType() &&
             N->getNumValues() == 1 && "Type mismatch");
      DAG.ReplaceAllUsesWith(N, &RV);
    }
    // 将新节点及其user都加入worklist
    AddToWorklist(RV.getNode());
    AddUsersToWorklist(RV.getNode());
    // 如果节点没有user那么递归的删除被替换的节点
    recursivelyDeleteUnusedNodes(N);
  }
  // 清理DAG
  DAG.setRoot(Dummy.getValue());
  DAG.RemoveDeadNodes();
}
// 将N加入到worklist的尾部, 保证DFS顺序
void AddToWorklist(SDNode *N) {
  // 跳过dummy node
  if (N->getOpcode() == ISD::HANDLENODE)
    return;
  ConsiderForPruning(N);
  // 返回true表示插入成功, false表示已在map里
  if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
    Worklist.push_back(N);
}
bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
  // user非空, 不能删除该节点
  if (!N->use_empty())
    return false;
  SmallSetVector<SDNode *, 16> Nodes;
  Nodes.insert(N);
  do {
    N = Nodes.pop_back_val();
    if (!N)
      continue;
    // user为空, 删除节点同时将其从worklist中移除
    if (N->use_empty()) {
      for (const SDValue &ChildN : N->op_values())
        Nodes.insert(ChildN.getNode());
      removeFromWorklist(N);
      DAG.DeleteNode(N);
    } else {
      // user非空, 说明其一个user已被删除, 可以尝试再次combine
      AddToWorklist(N);
    }
  } while (!Nodes.empty());
  return true;
}
DAGCombiner::Run()的算法在legalize与combine中经常见到, 其思路是DFS遍历DAG中的节点. 首先遍历DAG将所有节点加入容器worklist中. 然后每次取出一个节点, 依次判断:
SDValue DAGCombiner::combine(SDNode *N) {
  // 架构无关优化
  SDValue RV = visit(N);
  // 架构相关优化
  if (!RV.getNode()) {
    if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
        TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
      TargetLowering::DAGCombinerInfo
        DagCombineInfo(DAG, Level, false, this);
      RV = TLI.PerformDAGCombine(N, DagCombineInfo);
    }
  }
  // promote operation
  if (!RV.getNode()) {
    switch (N->getOpcode()) {
    default: break;
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
      RV = PromoteIntBinOp(SDValue(N, 0));
      break;
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      RV = PromoteIntShiftOp(SDValue(N, 0));
      break;
    case ISD::SIGN_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::ANY_EXTEND:
      RV = PromoteExtend(SDValue(N, 0));
      break;
    case ISD::LOAD:
      if (PromoteLoad(SDValue(N, 0)))
        RV = SDValue(N, 0);
      break;
    }
  }
  ......
  return RV;
}
combine()的逻辑分为三部分:
class TargetLoweringBase {
  // 记录是否需要target comhbine的target independent的Node的标记位数组
  unsigned char TargetDAGCombineArray[(ISD::BUILTIN_OP_END+CHAR_BIT-1)/CHAR_BIT];
public:
  bool hasTargetDAGCombine(ISD::NodeType NT) const {
    assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
    return TargetDAGCombineArray[NT >> 3] & (1 << (NT&7));
  }
  void setTargetDAGCombine(ISD::NodeType NT) {
    assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
    TargetDAGCombineArray[NT >> 3] |= 1 << (NT&7);
  }
  // 注意该接口的返回值与架构无关的combine实现(visit())是一致的:
  // 返回空的SDValue代表没有修改, 返回指向入参的SDValue代表节点已被替换, 否则在combine()中被替换
  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
};
以X86为例, X86TargetLowering::X86TargetLowering()中自定义了许多架构无关的Node. 我们看一个LOAD的例子.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // 对于32byte非对齐load性能较差的架构, 使用2次16byte的load来提升性能
  // fast返回架构是否支持正常的32byte非对齐laod
  ISD::LoadExtType Ext = Ld->getExtensionType();
  bool Fast;
  unsigned Alignment = Ld->getAlignment();
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
      Ext == ISD::NON_EXTLOAD &&
      ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                               *Ld->getMemOperand(), &Fast) &&
        !Fast))) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();
    unsigned HalfAlign = 16;
    SDValue Ptr1 = Ld->getBasePtr();
    SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems / 2);
    // 拆分为两条load
    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
                    Alignment, Ld->getMemOperand()->getFlags());
    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
                                Ld->getPointerInfo().getWithOffset(HalfAlign),
                                MinAlign(Alignment, HalfAlign),
                                Ld->getMemOperand()->getFlags());
    // 建立依赖
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1), Load2.getValue(1));
    // 合并两个load结果
    SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
    // 调用CombineTo()替换原来的Node
    return DCI.CombineTo(N, NewVec, TF, true);
  }
  if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
      RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
    unsigned NumElts = RegVT.getVectorNumElements();
    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
    if (TLI.isTypeLegal(IntVT)) {
      SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
                                    Ld->getPointerInfo(), Alignment,
                                    Ld->getMemOperand()->getFlags());
      SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
      return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
    }
  }
  return SDValue();
}
小结:
原文:https://www.cnblogs.com/Five100Miles/p/12840216.html