LLVMのswitchを解析してみたかった。 - タイキハバンセイセズ

rutilicus.hatenablog.com
これの続きです。改めて解析対象のソースコードなど。

// Sample function used as the analysis target.
// Reads a value via getParam(), dispatches on it with a three-way switch,
// then calls foo() and bar() regardless of which arm ran. Always returns 0.
// None of the called functions are declared in this snippet.
int a() {
    int type;

    type = getParam();

    // Arms: type == 1, type == 2, and everything else (default).
    switch (type) {
        case 1:
            hoge();
            // oops
            break;
        case 2:
            newHoge();
            break;
        default:
            // The default arm is the only one that makes two calls.
            piyo();
            poyo();
            break;
    }

    // Executed after the switch on every path.
    foo();
    bar();

    return 0;
}

これから出力したLLVM IRも。

LLVM IR

; ModuleID = 'aft.c'
source_filename = "aft.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Function Attrs: noinline nounwind optnone uwtable
; IR for the C function a(): one entry block, one block per switch arm
; (sw.bb, sw.bb2, sw.default) and a common join block (sw.epilog).
define dso_local i32 @a() #0 {
entry:
  ; Stack slot for the local variable `type`.
  %type = alloca i32, align 4
  %call = call i32 (...) @getParam()
  store i32 %call, i32* %type, align 4
  ; The switch operand is the unnamed temporary %0 (a load of %type),
  ; not %type itself.
  %0 = load i32, i32* %type, align 4
  switch i32 %0, label %sw.default [
    i32 1, label %sw.bb
    i32 2, label %sw.bb2
  ]

; case 1
sw.bb:                                            ; preds = %entry
  %call1 = call i32 (...) @hoge()
  br label %sw.epilog

; case 2
sw.bb2:                                           ; preds = %entry
  %call3 = call i32 (...) @newHoge()
  br label %sw.epilog

; default
sw.default:                                       ; preds = %entry
  %call4 = call i32 (...) @piyo()
  %call5 = call i32 (...) @poyo()
  br label %sw.epilog

; Join point after the switch; reached from every arm.
sw.epilog:                                        ; preds = %sw.default, %sw.bb2, %sw.bb
  %call6 = call i32 (...) @foo()
  %call7 = call i32 (...) @bar()
  ret i32 0
}

declare dso_local i32 @getParam(...) #1

declare dso_local i32 @hoge(...) #1

declare dso_local i32 @newHoge(...) #1

declare dso_local i32 @piyo(...) #1

declare dso_local i32 @poyo(...) #1

declare dso_local i32 @foo(...) #1

declare dso_local i32 @bar(...) #1

attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 10.0.0-4ubuntu1 "}

で、これを解析しようとしたコードがこれです。

#include <algorithm>
#include <iostream>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <string>

#include <llvm/IR/Instruction.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Support/Casting.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/raw_ostream.h>

int main(int argc, char** argv) {
    llvm::LLVMContext context;
    llvm::SMDiagnostic err;
    std::unique_ptr<llvm::Module> module = llvm::parseIRFile("aft.ll", err, context);

    if (!module) {
        err.print(argv[0], llvm::errs());
        return 1;
    }

    for (llvm::Function &function: module->getFunctionList()) {
        if (!function.isDeclaration()) {
            // Condition Id -> Condition String
            std::map<int, std::string> condIdStrMap;
            // Condition Id
            int condId = 0;
            // BasicBlock Name -> Block Condition Ids
            std::map<std::string, std::set<int>> blockCondsMap;
            // Condition sets derived from same origin Conditional Syntax
            std::list<std::set<int>> condSetSameOrigin;

            for (llvm::BasicBlock &block: function) {
                std::string currentBlockName = block.getName().data();

                // remove unnecessary block conditions
                if (blockCondsMap.find(currentBlockName) == blockCondsMap.end()) {
                    blockCondsMap[currentBlockName] = std::set<int>();
                }
                std::list<int> allBlockConditions;
                for (int i: blockCondsMap[currentBlockName]) {
                    allBlockConditions.push_back(i);
                }
                for (int i: allBlockConditions) {
                    for (std::set<int> s: condSetSameOrigin) {
                        if (s.find(i) != s.end() &&
                            std::all_of(s.begin(), s.end(),
                                        [&](int x) { return blockCondsMap[currentBlockName].find(x) !=
                                                     blockCondsMap[currentBlockName].end(); }))  {
                            std::for_each(s.begin(), s.end(), 
                                          [&](int x) { blockCondsMap[currentBlockName].erase(x); });
                        }
                    }
                }
                std::string blockConditionStr;
                if (blockCondsMap[currentBlockName].empty()) {
                    blockConditionStr = "N/A";
                } else {
                    for (int i: blockCondsMap[currentBlockName]) {
                        blockConditionStr += condIdStrMap[i] + "OR";
                    }
                    blockConditionStr.resize(blockConditionStr.length() - 2);
                }

                for (llvm::Instruction &instr: block) {
                    switch (instr.getOpcode()) {
                        case llvm::Instruction::Switch: {
                                llvm::SwitchInst &switchInst = 
                                    static_cast<llvm::SwitchInst &>(instr);
                                std::list<int> condIds;
                                for (auto &llvmCase: switchInst.cases()) {
                                    // cases
                                    std::string condStr = "(";
                                    condStr += switchInst.getOperand(0)->getName().data();
                                    condStr += "=";
                                    condStr += llvmCase.getCaseValue()->getZExtValue();
                                    condStr += + ")";
                                    condIdStrMap[condId] = condStr;
                                    condIds.push_back(condId);
                                    std::string blockName = llvmCase.getCaseSuccessor()->getName().data();
                                    if (blockCondsMap.find(blockName) != blockCondsMap.end()) {
                                        blockCondsMap[blockName].insert(condId);
                                    } else {
                                        blockCondsMap[blockName] = std::set<int>{condId};
                                    }
                                    condId++;
                                }
                                {
                                    // default
                                    std::string condStr = "(NOT(";
                                    for (int i: condIds) {
                                        condStr += condIdStrMap[i];
                                        condStr += "OR";
                                    }
                                    condStr.resize(condStr.length() - 2);
                                    condStr += "))";
                                    condIdStrMap[condId] = condStr;
                                    condIds.push_back(condId);
                                    std::string blockName = switchInst.getDefaultDest()->getName().data();
                                    if (blockCondsMap.find(blockName) != blockCondsMap.end()) {
                                        blockCondsMap[blockName].insert(condId);
                                    } else {
                                        blockCondsMap[blockName] = std::set<int>{condId};
                                    }
                                    condId++;
                                }
                            }
                            break;
                        case llvm::Instruction::Call: {
                                llvm::CallBase &callBase =
                                    static_cast<llvm::CallBase &>(instr);
                                std::cout << blockConditionStr << "\t" <<
                                    callBase.getCalledFunction()->getName().data() << 
                                    std::endl;
                            }
                            break;
                        default:
                            break;
                    }
                }
            }
        }
    }

    return 0;
}

ネストしたswitchには対応していません。ちょっといじれば対応できる作りにはしてありますが。
で、実行結果はこんな感じです。

N/A	getParam
(=x01)	hoge
(=x02)	newHoge
(NOT((=x01)OR(=x02)))	piyo
(NOT((=x01)OR(=x02)))	poyo
N/A	foo
N/A	bar

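作りは正直かなり雑です。switchで分岐する各BasicBlockごとに条件を割り振っているのですが、その合流後の消し方も同一制御構文から発生した条件がすべてそろっている場合は消す、という方法をとっています。条件の書き方もデータ構造を用意するのではなくstringでベタベタに書いています。そして、見て分かりますが判断に使用したオペランドの名前を取れていません。switchのオペランドは `%type` をloadした結果である `%0` という名前のない一時値なので、getName()が空文字列を返すためです。これはどうやらそういうもののようです。(LLVM get operand and lvalue name of an instruction - Stack Overflow) なお、実行結果でcaseの値が `x01` や `x02` のように表示されているのは、整数値が10進の文字列ではなく文字コード1, 2の文字そのものとして出力されたためです。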

今後についてはこれを継続するかどうか悩んでいます。これ以上進むとなるとまぁ、全Instructionの分析が必要になりますよね……私の実業務で困ったことではあるものの、そこまで労力をつぎこんでいいものかどうか……

参考にしました

tomo-wait-for-it-yuki.hatenablog.com