How to Merge ASTs of Different C Files

#include <string>
#include <vector>
    #include <fstream>
    #include <filesystem>

#include "clang/AST/ASTImporter.h"
#include "clang/AST/ASTImporterSharedState.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "clang/ASTMatchers/ASTMatchers.h"
#include "clang/Tooling/Tooling.h"


// Access Matched Nodes Information
#include "clang/AST/ASTContext.h"

using namespace clang::tooling;
using namespace llvm;

#include "clang/ASTMatchers/ASTMatchers.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"



#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"


using namespace clang;
using namespace clang::ast_matchers;



/// Returns a copy of `s` with leading and trailing whitespace removed.
///
/// Whitespace is whatever std::isspace reports for the current C locale.
/// An empty or all-whitespace input yields an empty string.
///
/// @param s  Text to trim; it is not modified.
/// @return   The trimmed copy.
std::string trim(const std::string &s) {
    // std::isspace has undefined behaviour for negative char values, so each
    // character is converted to unsigned char before the call.
    const auto isSpace = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    auto first = s.begin();
    while (first != s.end() && isSpace(*first)) {
        ++first;
    }

    // Walk backwards from the end without ever stepping in front of `first`.
    // This keeps the empty-string case safe: begin() is never decremented.
    auto last = s.end();
    while (last != first && isSpace(*(last - 1))) {
        --last;
    }

    return std::string(first, last);
}


/// Parses a marker comment of the form `//program0, program1, ...` into the
/// list of program names it mentions.
///
/// @param input  One line of text (callers pass the last line of a source file).
/// @return The comma-separated entries that follow the leading "//", each
///         trimmed of surrounding whitespace. Returns an empty vector when
///         `input` does not start with "//program". Entries that are empty
///         after trimming are skipped.
std::vector<std::string> parseProgramPaths(const std::string& input) {
    std::vector<std::string> result;

    // Only the prefix is validated, so an unrelated comment that happens to
    // start with "//program" is still accepted.
    if (input.rfind("//program", 0) != 0) {
        return result;
    }

    // Drop the leading "//" and split the remainder on commas.
    std::stringstream ss(input.substr(2));
    std::string entry;
    while (std::getline(ss, entry, ',')) {
        // ",," produces an empty token; never hand that to trim() and never
        // record it as a program name.
        if (entry.empty()) {
            continue;
        }
        entry = trim(entry);
        if (!entry.empty()) {
            result.push_back(entry);
        }
    }

    return result;
}

/// Reads the file at `filePath` line by line and returns the final line read.
///
/// @param filePath  Path of the file to read.
/// @return The last line produced by std::getline, or an empty string when
///         the file cannot be opened or contains no lines.
std::string getLastLine(const std::string& filePath) {
    std::ifstream input(filePath, std::ios::in);
    std::string newest;

    if (input.is_open()) {
        // Each successfully read line overwrites the previous one, so the
        // value left over after the loop is the last line of the file.
        for (std::string current; std::getline(input, current);) {
            newest = current;
        }
        input.close();
    }

    return newest;
}

/// Collects the sub-directories located directly inside `directory`
/// (one level only, no recursion).
///
/// @param directory  Directory to scan.
/// @param files      Output: the path of every entry for which
///                   llvm::sys::fs::is_directory is true is appended.
///                   Despite the name, only directories are added.
void getImmediateDirectories(const std::string &directory, std::vector<std::string> &files){
    std::error_code ec;
    for (llvm::sys::fs::directory_iterator dirIter(directory, ec), dirEnd;
         dirIter != dirEnd && !ec;
         dirIter.increment(ec)) {
        if (llvm::sys::fs::is_directory(dirIter->path())) {
            files.push_back(dirIter->path());
        }
    }
    // The loop condition leaves as soon as `ec` is set, so a check inside the
    // body can never fire. Report a failed open or increment here instead.
    if (ec) {
        llvm::errs() << "Error accessing directory: " << ec.message() << "\n";
    }
}

/// Recursively collects every directory found below `directory`.
///
/// @param directory                Root of the recursive scan.
/// @param extraIncludeDirectories  Output: the path of every entry for which
///                                 llvm::sys::fs::is_directory is true is
///                                 appended.
void getIncludeDirectoriesInRepo(const std::string &directory,std::vector<std::string> &extraIncludeDirectories) {
    std::error_code ec;
    for (llvm::sys::fs::recursive_directory_iterator dirIter(directory, ec), dirEnd;
         dirIter != dirEnd && !ec;
         dirIter.increment(ec)) {
        if (llvm::sys::fs::is_directory(dirIter->path())) {
            extraIncludeDirectories.push_back(dirIter->path());
        }
    }
    // The loop condition leaves as soon as `ec` is set, so a check inside the
    // body can never fire. Report a failed open or increment here instead.
    if (ec) {
        llvm::errs() << "Error accessing directory: " << ec.message() << "\n";
    }
}
/// Recursively scans `directory` and records which files belong to which
/// program, based on the marker comment on each file's last line.
///
/// For every regular file, the last line is read with getLastLine() and parsed
/// with parseProgramPaths(); the file path is then appended to
/// `myMap[directory][program]` for each program name found. Files whose last
/// line yields no program names leave the map untouched.
///
/// @param directory  Repository root; also used as the outer map key.
/// @param myMap      Output map: repository -> program name -> file paths.
void getMapInformationInRepo(const std::string &directory, std::map<std::string, std::map<std::string, std::vector<std::string>>> &myMap) {
    std::error_code ec;
    for (llvm::sys::fs::recursive_directory_iterator dirIter(directory, ec), dirEnd;
         dirIter != dirEnd && !ec;
         dirIter.increment(ec)) {
        if (!llvm::sys::fs::is_regular_file(dirIter->path())) {
            continue;
        }
        const std::string lastLine = getLastLine(dirIter->path());
        const std::vector<std::string> paths = parseProgramPaths(lastLine);
        for (const std::string& program_path : paths) {
            myMap[directory][program_path].push_back(dirIter->path());
        }
    }
    // The loop condition leaves as soon as `ec` is set, so a check inside the
    // body can never fire. Report a failed open or increment here instead.
    if (ec) {
        llvm::errs() << "Error accessing directory: " << ec.message() << "\n";
    }
}


/// Runs matcher `M` over the AST owned by `Unit` and returns the first node
/// bound by it, cast to `Node *`.
///
/// Both "at least one match" and "the bound node has type Node" are enforced
/// with assert only.
template <typename Node, typename Matcher>
Node *getFirstDecl(Matcher M, const std::unique_ptr<ASTUnit> &Unit) {
  // Key under which the matched node is bound and later looked up.
  const char *const Key = "bindStr";

  auto Matches = match(M.bind(Key), Unit->getASTContext());
  assert(Matches.size() >= 1);

  // getNodeAs returns a pointer to const; callers want a mutable node.
  Node *First = const_cast<Node *>(Matches[0].template getNodeAs<Node>(Key));
  assert(First);
  return First;
}

//temporary function to print our Maps

/*
Map has the following format:
{
  "pathRepository1":
  {
    "program0": ["path1.c", "path2.c", ...], //these c files are compiled together. We want to merge these into one AST
    "program1" : ["path5.c", "path1.c", ...], //per repository there might be multiple programs
    ...
  },
  ...
}
*/

/// Runs `Write` against an output stream selected by `OutputFileName`.
///
/// - "-"          : `Write` receives llvm::errs().
/// - "/dev/null"  : `Write` receives a raw_null_ostream.
/// - anything else: `Write` receives a stream over a temporary file named
///   `<OutputFileName>.temp-stream-%%%%%%`; on success the temporary file is
///   kept under `OutputFileName`, on failure it is discarded.
///
/// @param OutputFileName  Destination name, or one of the special names above.
/// @param Write           Callback that produces the output.
/// @return The error returned by `Write` (joined with the discard error, if
///         discarding also fails), a file error if the temporary file cannot
///         be created, or the result of keeping the temporary file.
Error writeToOutputErr(StringRef OutputFileName,
                          std::function<Error(raw_ostream &)> Write) {
  if (OutputFileName == "-")
    return Write(llvm::errs());

  if (OutputFileName == "/dev/null") {
    raw_null_ostream Out;
    return Write(Out);
  }

  unsigned Mode = sys::fs::all_read | sys::fs::all_write;
  Expected<sys::fs::TempFile> Temp =
      sys::fs::TempFile::create(OutputFileName + ".temp-stream-%%%%%%", Mode);
  if (!Temp)
    return createFileError(OutputFileName, Temp.takeError());

  // The stream does not own the descriptor (TempFile does) and is unbuffered.
  raw_fd_ostream Out(Temp->FD, /*shouldClose=*/false, /*unbuffered=*/true);

  if (Error E = Write(Out)) {
    if (Error DiscardError = Temp->discard())
      return joinErrors(std::move(E), std::move(DiscardError));
    return E;
  }
  Out.flush();

  return Temp->keep(OutputFileName);
}

bool Save(const std::unique_ptr<ASTUnit> &Unit , StringRef File) {
  if (llvm::Error Err = writeToOutputErr( //instead of llvm::writeToOutput we use own adapted to errs()
          File, [&Unit](llvm::raw_ostream &Out) { //experiment here with the IO stream to avoid "Segmentation fault (core dumped)"
            //raw_ostream is default
            return Unit->serialize(Out) ? llvm::make_error<llvm::StringError>( //default Out x -> llvm::errs() x
                                        "ASTUnit serialization failed",
                                        llvm::inconvertibleErrorCode())
                                  : llvm::Error::success();
          })) {
    consumeError(std::move(Err));
    return true;
  }
  return false;
}
 
void mergeASTsAndSave(const std::map<std::string, std::map<std::string, std::vector<std::string>>> &myMap) {
    std::vector<std::string> IncludeDirs = {"-fsyntax-only", "-Wfatal-errors","-Wno-implicit-int","-Wno-int-conversion", "-Wno-deprecated-declarations", "-Wno-implicit-function-declaration","-Wno-unused-value"
        ,"-I/home/toge/clang-llvm/lib/clang/19/include/","-I/usr/include","-I/usr/include/x86_64-linux-gnu", "-I/usr/local/include", "-I/usr/lib/gcc/x86_64-linux-gnu/11/include"};
    for (const auto &[repoName, programs] : myMap) {
        for (const auto &[programName, files] : programs) {
            // Build an empty ToUnit, but give information about the path for proper include-resolving purposes.
            std::unique_ptr<ASTUnit> ToUnit = buildASTFromCodeWithArgs("",IncludeDirs,"to.c");
            ToUnit->enableSourceFileDiagnostics();
            auto ImporterState = std::make_shared<ASTImporterSharedState>(*ToUnit->getASTContext().getTranslationUnitDecl());
            for (const auto &file : files) { //Currently in one repo, at one program. Here we traverse the files of program.
             
                
                llvm::outs() << "prepare file read maybe after here" << "\n";
                llvm::outs() << file << "\n";
                std::ifstream fileStream(file);
                std::stringstream buffer;
                buffer << fileStream.rdbuf();
                std::string fileContent = buffer.str();
                std::unique_ptr<ASTUnit> FromUnit = buildASTFromCodeWithArgs(fileContent, IncludeDirs, file);
                FromUnit->enableSourceFileDiagnostics();

                ASTImporter Importer(ToUnit->getASTContext(), ToUnit->getFileManager(),
                                     FromUnit->getASTContext(), FromUnit->getFileManager(),
                                     /*MinimalImport=*/false, ImporterState);

                
                auto *FromTU = FromUnit->getASTContext().getTranslationUnitDecl();
                for (auto *D : FromTU->decls()) {
                  // Don't re-import __va_list_tag, __builtin_va_list.
                  if (const auto *ND = dyn_cast<NamedDecl>(D))
                    if (IdentifierInfo *II = ND->getIdentifier())
                      if (II->isStr("__va_list_tag") || II->isStr("__builtin_va_list"))
                        continue;
                    llvm::Expected<Decl *> ToDOrError = Importer.Import(D);
                    /*if (ToDOrError) {
                        ToUnit->addTopLevelDecl(*ToDOrError); ?
                    } else {
                        llvm::consumeError(ToDOrError.takeError());
                    }*/
                }
                
            }
            std::filesystem::path outputASTFile = std::filesystem::path(repoName) / (programName + ".ast");

            ToUnit->Save(outputASTFile.string());
            ToUnit->getASTContext().getTranslationUnitDecl()->dump(llvm::errs()); // Tree dump
            ToUnit->getASTContext().getTranslationUnitDecl()->print(llvm::errs()); // raw output
            
        }
    }
}
/// Entry point: treats every immediate sub-directory of "C_COMPILE" as a
/// repository, groups its source files by program, and merges and saves one
/// AST per program.
///
/// @return 0 on success, 1 when "C_COMPILE" is not a directory.
///         Command-line arguments are currently ignored.
int main(int argc, const char **argv) {
    const std::string targetDirectory = "C_COMPILE";
    if (!llvm::sys::fs::is_directory(targetDirectory)) {
        llvm::errs() << "Error: " << targetDirectory << " is not a directory.\n";
        return 1;
    }

    std::vector<std::string> allRepos;
    getImmediateDirectories(targetDirectory, allRepos);

    // repository path -> program name -> source files of that program
    std::map<std::string, std::map<std::string, std::vector<std::string>>> myMap;
    for (const auto &current_repo : allRepos) {
        getMapInformationInRepo(current_repo, myMap);
    }

    mergeASTsAndSave(myMap);
    return 0;
}

I have made a tool (based on “ASTImporter: Merging Clang ASTs — Clang 19.0.0git documentation” and on ASTMerger.cpp, which is meant to merge AST files) that iterates over directories, merges related source files into one big AST, and dumps/saves/prints it. I have made a couple of test folders that contain C files; all C files that belong together share a common comment at the end of the source file. I have uploaded some test folders in case you want to experiment: “LLVM ASTImporter Bug - Google Drive”. The tool traverses the directories recursively and creates a map of related source files. It then loops over the related source files and tries to merge them with the Importer.

If I iterate over every file and just create independent ASTs and save/dump/print them, I get absolutely no errors (that is, if we move the save, dump and print calls into the loop and change ToUnit → FromUnit).
The moment I start to merge them — that is, import the related “from” contexts into a single empty “to” context with the Importer — things get bizarre. E.g. the importer does not fill in the prototypes with their implementations, but rather keeps the prototypes and re-imports the implementations. And I now get an LLVM IO error or a Segmentation fault (core dumped). Where is my mistake?

1 Like