本节将要介绍指令选择中combine优化的概念, combine的目的是简化DAG, 合并/消除冗余节点, 为生成更优的指令做准备. 大部分combine是与架构无关的优化, 但LLVM也提供了修改combine的custom接口.
尽管本节介绍的是combine的流程, 但combine与legalize及lowering存在关联, 我们在介绍时也会涉及相关的概念.
使用上节提到的图形化方式来阅读DAG固然利于理解, 但是却不方便调试. 这里更推荐使用LLVM的日志系统打印文字版的DAG描述. 在编译时添加-mllvm -debug-only=isel
即可打印SelectionDAGISel模块的调试信息, 以下截取部分.
Initial selection DAG: %bb.0 ‘test:entry‘
SelectionDAG has 29 nodes:
t0: ch = EntryToken
t2: i32,ch = CopyFromReg t0, Register:i32 %0
t12: i32 = Constant<0>
t14: i32,ch = load<(load 4 from %ir.p1, !tbaa !2)> t0, t2, undef:i32
t6: i32,ch = CopyFromReg t0, Register:i32 %2
t18: i32 = mul nsw t14, t6
t8: i32,ch = CopyFromReg t0, Register:i32 %3
t10: i32 = AssertZext t8, ValueType:ch:i8
t11: i8 = truncate t10
t16: i8 = and t11, Constant:i8<31>
t17: i32 = zero_extend t16
t19: i32 = shl t18, t17
t24: i32 = GlobalAddress<i32 (i32)* @test2> 0
t4: i32,ch = CopyFromReg t0, Register:i32 %1
t21: i32 = shl t4, Constant:i32<2>
t22: i32 = add t2, t21
t23: ch = store<(store 4 into %ir.arrayidx1, !tbaa !2)> t14:1, t19, t22, undef:i32
t26: ch,glue = CopyToReg t23, Register:i32 $x10, t19
t28: ch,glue = RISCVISD::TAIL t26, TargetGlobalAddress:i32<i32 (i32)* @test2> 0 [TF=1], Register:i32 $x10, t26:1
Optimized lowered selection DAG: %bb.0 ‘test:entry‘
SelectionDAG has 25 nodes:
t0: ch = EntryToken
t2: i32,ch = CopyFromReg t0, Register:i32 %0
t14: i32,ch = load<(load 4 from %ir.p1, !tbaa !2)> t0, t2, undef:i32
t6: i32,ch = CopyFromReg t0, Register:i32 %2
t18: i32 = mul nsw t14, t6
t8: i32,ch = CopyFromReg t0, Register:i32 %3
t10: i32 = AssertZext t8, ValueType:ch:i8
t30: i32 = and t10, Constant:i32<31>
t19: i32 = shl t18, t30
t4: i32,ch = CopyFromReg t0, Register:i32 %1
t21: i32 = shl t4, Constant:i32<2>
t22: i32 = add t2, t21
t23: ch = store<(store 4 into %ir.arrayidx1, !tbaa !2)> t14:1, t19, t22, undef:i32
t26: ch,glue = CopyToReg t23, Register:i32 $x10, t19
t28: ch,glue = RISCVISD::TAIL t26, TargetGlobalAddress:i32<i32 (i32)* @test2> 0 [TF=1], Register:i32 $x10, t26:1
SelectionDAG类实现了一个名为dump()(defined in lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp)的接口用于打印DAG图. 关于接口的具体实现不再赘述, 这里介绍下怎么阅读文字版的DAG.
SelectionDAGISel::CodeGenAndEmitDAG()中会交替调用SelectionDAG::Combine(), SelectionDAG::Legalize()等接口(即概述中的那张流程图). 多次Combine()的原因是每次legalize后DAG都可能发生变化, 所以需要尝试多次优化DAG. SelectionDAG::Combine()会创建一个DAGCombiner的类并调用DAGCombiner::Run(). 注意Combine会传入一个CombineLevel(defined in include/llvm/CodeGen/DAGCombine.h)的枚举表明调用Combine()接口时的时间点, 对应不同时间点DAGCombiner的优化也不同.
void DAGCombiner::Run(CombineLevel AtLevel) {
......
// 将DAG中所有节点加入worklist
for (SDNode &Node : DAG.allnodes())
AddToWorklist(&Node);
// 创建一个引用root的dummy节点来防止root节点被优化
HandleSDNode Dummy(DAG.getRoot());
while (SDNode *N = getNextWorklistEntry()) {
// 如果一个节点没有user则删除该节点, 当一个节点被删除时递归检查它的operand是否也可以被删除
if (recursivelyDeleteUnusedNodes(N))
continue;
WorklistRemover DeadNodes(*this);
// 如果在legalize operation之后combine节点, 必须要保证combine后的节点也是legalize的
if (Level == AfterLegalizeDAG) {
SmallSetVector<SDNode *, 16> UpdatedNodes;
bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
for (SDNode *LN : UpdatedNodes) {
AddUsersToWorklist(LN);
AddToWorklist(LN);
}
if (!NIsValid)
continue;
}
// 检查被combine的节点的operand是否需要被加入尝试合并/替换当前节点来化简DAG.worklist
CombinedNodes.insert(N);
for (const SDValue &ChildN : N->op_values())
if (!CombinedNodes.count(ChildN.getNode()))
AddToWorklist(ChildN.getNode());
// combine入口
SDValue RV = combine(N);
// 返回SDNode指针为空说明combine()没有生成新的节点, 跳过替换步骤
if (!RV.getNode())
continue;
// 返回的SDNode指针为原节点表明CombineTo()已经处理了该节点, 无需再次替换
if (RV.getNode() == N)
continue;
// 检查新节点返回的值的个数, 并替换旧结点
if (N->getNumValues() == RV.getNode()->getNumValues())
DAG.ReplaceAllUsesWith(N, RV.getNode());
else {
assert(N->getValueType(0) == RV.getValueType() &&
N->getNumValues() == 1 && "Type mismatch");
DAG.ReplaceAllUsesWith(N, &RV);
}
// 将新节点及其user都加入worklist
AddToWorklist(RV.getNode());
AddUsersToWorklist(RV.getNode());
// 如果节点没有user那么递归的删除被替换的节点
recursivelyDeleteUnusedNodes(N);
}
// 清理DAG
DAG.setRoot(Dummy.getValue());
DAG.RemoveDeadNodes();
}
// 将N加入到worklist的尾部, 保证DFS顺序
void AddToWorklist(SDNode *N) {
// 跳过dummy node
if (N->getOpcode() == ISD::HANDLENODE)
return;
ConsiderForPruning(N);
// 返回true表示插入成功, false表示已在map里
if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
Worklist.push_back(N);
}
bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
// user非空, 不能删除该节点
if (!N->use_empty())
return false;
SmallSetVector<SDNode *, 16> Nodes;
Nodes.insert(N);
do {
N = Nodes.pop_back_val();
if (!N)
continue;
// user为空, 删除节点同时将其从worklist中移除
if (N->use_empty()) {
for (const SDValue &ChildN : N->op_values())
Nodes.insert(ChildN.getNode());
removeFromWorklist(N);
DAG.DeleteNode(N);
} else {
// user非空, 说明其一个user已被删除, 可以尝试再次combine
AddToWorklist(N);
}
} while (!Nodes.empty());
return true;
}
DAGCombiner::Run()的算法在legalize与combine中经常见到, 其思路是DFS遍历DAG中的节点. 首先遍历DAG将所有节点加入容器worklist中. 然后每次取出一个节点, 依次判断:
SDValue DAGCombiner::combine(SDNode *N) {
// 架构无关优化
SDValue RV = visit(N);
// 架构相关优化
if (!RV.getNode()) {
if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
TargetLowering::DAGCombinerInfo
DagCombineInfo(DAG, Level, false, this);
RV = TLI.PerformDAGCombine(N, DagCombineInfo);
}
}
// promote operation
if (!RV.getNode()) {
switch (N->getOpcode()) {
default: break;
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
RV = PromoteIntBinOp(SDValue(N, 0));
break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
RV = PromoteIntShiftOp(SDValue(N, 0));
break;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
RV = PromoteExtend(SDValue(N, 0));
break;
case ISD::LOAD:
if (PromoteLoad(SDValue(N, 0)))
RV = SDValue(N, 0);
break;
}
}
......
return RV;
}
combine()的逻辑分为三部分:
class TargetLoweringBase {
// 记录是否需要target comhbine的target independent的Node的标记位数组
unsigned char TargetDAGCombineArray[(ISD::BUILTIN_OP_END+CHAR_BIT-1)/CHAR_BIT];
public:
bool hasTargetDAGCombine(ISD::NodeType NT) const {
assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
return TargetDAGCombineArray[NT >> 3] & (1 << (NT&7));
}
void setTargetDAGCombine(ISD::NodeType NT) {
assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
TargetDAGCombineArray[NT >> 3] |= 1 << (NT&7);
}
// 注意该接口的返回值与架构无关的combine实现(visit())是一致的:
// 返回空的SDValue代表没有修改, 返回指向入参的SDValue代表节点已被替换, 否则在combine()中被替换
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
};
以X86为例, X86TargetLowering::X86TargetLowering()中自定义了许多架构无关的Node. 我们看一个LOAD的例子.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
LoadSDNode *Ld = cast<LoadSDNode>(N);
EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
SDLoc dl(Ld);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// 对于32byte非对齐load性能较差的架构, 使用2次16byte的load来提升性能
// fast返回架构是否支持正常的32byte非对齐laod
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
*Ld->getMemOperand(), &Fast) &&
!Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
unsigned HalfAlign = 16;
SDValue Ptr1 = Ld->getBasePtr();
SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
// 拆分为两条load
SDValue Load1 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
Alignment, Ld->getMemOperand()->getFlags());
SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
Ld->getPointerInfo().getWithOffset(HalfAlign),
MinAlign(Alignment, HalfAlign),
Ld->getMemOperand()->getFlags());
// 建立依赖
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1), Load2.getValue(1));
// 合并两个load结果
SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
// 调用CombineTo()替换原来的Node
return DCI.CombineTo(N, NewVec, TF, true);
}
if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
unsigned NumElts = RegVT.getVectorNumElements();
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
if (TLI.isTypeLegal(IntVT)) {
SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Alignment,
Ld->getMemOperand()->getFlags());
SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
}
}
return SDValue();
}
小结:
原文:https://www.cnblogs.com/Five100Miles/p/12840216.html