공부/Python

[C 함수 파싱] clang library

래울 2025. 2. 10. 00:38

소스 코드를 바이너리 코드로 변환할 때 사용하는 LLVM 기반의 libclang으로 C 함수를 파싱한다.

 

LLVM-<version>-win64.exe 다운로드

https://github.com/llvm/llvm-project/releases

 

Releases · llvm/llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies. - llvm/llvm-project

github.com

 

libclang docs

https://libclang.readthedocs.io/en/latest/_modules/clang/cindex.html

 

example code

dir structure

import clang.cindex
import re
import os

# Set the libclang.dll path on Windows (adjust it to match the install location).
clang.cindex.Config.set_library_file("C:/Program Files/LLVM/bin/libclang.dll")
# Shared libclang index; parse_functions_info() uses it to parse each file.
clang_index = clang.cindex.Index.create()
print("libclang.dll load success.")

# Filled by parse_functions_code():
# "<file basename>_<function name>" -> source text of that function.
project_functions = dict()


# Matches C string/char literals and both comment forms in a single
# left-to-right pass. Literals are matched so that comment markers inside them
# (e.g. "http://x") are left alone, and a "//" inside a block comment cannot
# swallow that comment's closing "*/".
_C_COMMENT_OR_LITERAL_RE = re.compile(
    r'"(?:\\.|[^"\\\n])*"'      # string literal
    r"|'(?:\\.|[^'\\\n])*'"     # character literal
    r"|//[^\n]*"                # line comment
    r"|/\*.*?\*/",              # block comment (may span lines)
    re.DOTALL,
)

# An #include directive, allowing whitespace before and after the '#'.
_INCLUDE_DIRECTIVE_RE = re.compile(r"\s*#\s*include")


def _drop_comment(match):
    """Return literals unchanged and replace comments with an empty string."""
    token = match.group(0)
    return "" if token.startswith("/") else token


def preprocess_file(input_file, output_file):
    """Write a copy of a C source file with comments and #include lines removed.

    Args:
        input_file: Path of the UTF-8 encoded C source file to read.
        output_file: Path the stripped source is written to (UTF-8).

    Returns:
        None. Read and write failures are printed rather than raised; when the
        input cannot be read, no output file is written.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            c_code = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"{e}")
        # Nothing was read, so there is nothing to preprocess or write.
        return

    # Remove comments while keeping string and character literals intact.
    c_code = _C_COMMENT_OR_LITERAL_RE.sub(_drop_comment, c_code)

    # Split into lines and drop every line that is an #include directive.
    preprocessed_code = "\n".join(
        line for line in c_code.split("\n")
        if not _INCLUDE_DIRECTIVE_RE.match(line)
    )

    # Save the result to the new file.
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(preprocessed_code)
    except OSError as e:
        print(f"{e}")


def parse_functions_info(file_path):
    """Return ``(name, start_line, end_line)`` for each function declaration.

    The file is parsed with the module-level ``clang_index``; only direct
    children of the translation unit whose kind is ``FUNCTION_DECL`` are
    reported. Line numbers are taken from each cursor's extent.
    """
    translation_unit = clang_index.parse(file_path)
    function_kind = clang.cindex.CursorKind.FUNCTION_DECL

    return [
        (cursor.spelling, cursor.extent.start.line, cursor.extent.end.line)
        for cursor in translation_unit.cursor.get_children()
        if cursor.kind == function_kind
    ]


def parse_functions_code(file_path, functions):
    """Store the source text of each listed function in ``project_functions``.

    Args:
        file_path: Path of the UTF-8 encoded source file the line ranges
            refer to.
        functions: Iterable of ``(name, start_line, end_line)`` tuples; the
            range is sliced as 1-based and inclusive.

    Returns:
        None. Each function's text is stored under the key
        ``"<file basename>_<function name>"``. If the file cannot be read, the
        error is printed and nothing is stored.
    """
    print(f"extract_code: {file_path}")
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            c_code = f.readlines()
    except (OSError, UnicodeDecodeError) as e:
        print(f"{e}")
        # Without the file contents there is nothing to slice; return instead
        # of failing on an unbound ``c_code`` in the loop below.
        return

    file_name = os.path.basename(file_path)
    for func_name, start, end in functions:
        func_key = file_name + '_' + func_name
        project_functions[func_key] = "".join(c_code[start - 1:end])


if __name__ == "__main__":
    # Example run for project A.
    project_name = 'A'
    source_files = [
        "sources/test.c",
        "sources/test2.c",
    ]

    output_dir = f"result/{project_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Strip comments and #include lines, saving each file as
    # <output dir>/<file.c>.
    for source_file in source_files:
        cleaned_path = f"{output_dir}/{os.path.basename(source_file)}"
        preprocess_file(source_file, cleaned_path)

    # Visit every file under the output directory and record its functions.
    for dir_path, _, file_names in os.walk(output_dir):
        for file_name in file_names:
            cleaned_path = f"{dir_path}/{file_name}"
            parse_functions_code(cleaned_path, parse_functions_info(cleaned_path))

    print(project_functions)