#include <algorithm>
#include <cctype>
#include <filesystem>
#include <fstream>
#include <map>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "clang/AST/ASTImporter.h"
#include "clang/AST/ASTImporterSharedState.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "clang/ASTMatchers/ASTMatchers.h"
#include "clang/Tooling/Tooling.h"
// Access Matched Nodes Information
#include "clang/AST/ASTContext.h"
using namespace clang::tooling;
using namespace llvm;
#include "clang/ASTMatchers/ASTMatchers.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
using namespace clang;
using namespace clang::ast_matchers;
// Returns a copy of `s` without leading and trailing whitespace.
//
// Whitespace is whatever std::isspace classifies as such. An empty or
// all-whitespace input yields an empty string.
std::string trim(const std::string &s) {
  // std::isspace is undefined for negative values, so every character is
  // converted to unsigned char (via the lambda parameter) before the call.
  const auto notSpace = [](unsigned char c) { return std::isspace(c) == 0; };

  const auto first = std::find_if(s.begin(), s.end(), notSpace);
  // The backwards search is bounded by `first`, so it never steps before
  // begin() (the previous do/while decremented end() even for an empty
  // string) and the resulting range can never be inverted.
  const auto last =
      std::find_if(s.rbegin(), std::string::const_reverse_iterator(first),
                   notSpace)
          .base();
  return std::string(first, last);
}
// Parses a marker line of the form "//program0, program1, ..." into the list
// of program names it mentions.
//
// Returns an empty vector when `input` does not start with "//program".
// Otherwise the leading "//" is dropped, the remainder is split on ',' and
// every piece is trimmed. Pieces that are empty after trimming (for example
// from "//program0,,program1") are skipped so that no blank program name is
// produced.
std::vector<std::string> parseProgramPaths(const std::string& input) {
  constexpr char kMarker[] = "//program";
  // Still be sceptical about the format: any line that merely starts with
  // this prefix is accepted, so unintended collisions are possible.
  if (input.compare(0, sizeof(kMarker) - 1, kMarker) != 0) {
    return {};
  }

  std::vector<std::string> result;
  // Skip the two comment slashes; the rest is the comma-separated list.
  std::istringstream entries(input.substr(2));
  std::string entry;
  while (std::getline(entries, entry, ',')) {
    std::string name = trim(entry);
    if (!name.empty()) {
      result.push_back(std::move(name));
    }
  }
  return result;
}
// Returns the last non-blank line of the file at `filePath`.
//
// Lines that are empty or consist only of whitespace are ignored, so a marker
// comment is still found when the file ends with extra blank lines. The line
// is returned unmodified (a trailing '\r' from CRLF files is kept). Returns an
// empty string when the file cannot be opened or has no non-blank line.
std::string getLastLine(const std::string& filePath) {
  std::ifstream file(filePath, std::ios::in);
  if (!file.is_open()) {
    return "";
  }

  std::string line;
  std::string lastLine;
  while (std::getline(file, line)) {
    if (line.find_first_not_of(" \t\r\n\v\f") != std::string::npos) {
      lastLine = line;
    }
  }
  // The stream is closed by its destructor.
  return lastLine;
}
// Appends the paths of the immediate (non-recursive) sub-directories of
// `directory` to `files`.
//
// Iteration stops at the first error reported by the directory iterator; that
// error is printed to llvm::errs(). Entries collected before the error stay in
// `files`.
void getImmediateDirectories(const std::string &directory, std::vector<std::string> &files) {
  std::error_code ec;
  for (llvm::sys::fs::directory_iterator dirIter(directory, ec), dirEnd;
       !ec && dirIter != dirEnd;
       dirIter.increment(ec)) {
    if (llvm::sys::fs::is_directory(dirIter->path())) {
      files.push_back(dirIter->path());
    }
  }
  // The check has to live after the loop: inside the body `ec` is always
  // clear, because the loop condition ends the iteration as soon as it is set.
  if (ec) {
    llvm::errs() << "Error accessing directory: " << ec.message() << "\n";
  }
}
// Appends the path of every directory found recursively below `directory` to
// `extraIncludeDirectories`.
//
// Iteration stops at the first error reported by the directory iterator; that
// error is printed to llvm::errs(). Entries collected before the error stay in
// the output vector.
void getIncludeDirectoriesInRepo(const std::string &directory,std::vector<std::string> &extraIncludeDirectories) {
  std::error_code ec;
  for (llvm::sys::fs::recursive_directory_iterator dirIter(directory, ec), dirEnd;
       !ec && dirIter != dirEnd;
       dirIter.increment(ec)) {
    if (llvm::sys::fs::is_directory(dirIter->path())) {
      extraIncludeDirectories.push_back(dirIter->path());
    }
  }
  // The check has to live after the loop: inside the body `ec` is always
  // clear, because the loop condition ends the iteration as soon as it is set.
  if (ec) {
    llvm::errs() << "Error accessing directory: " << ec.message() << "\n";
  }
}
// Scans every regular file below `directory` (recursively) and records it in
// `myMap` under each program named in the file's last line.
//
// The last line is read with getLastLine() and parsed with
// parseProgramPaths(); a file whose last line carries no program marker is
// not recorded. The resulting entries have the shape
//   myMap[directory][programName] -> list of file paths.
//
// Iteration stops at the first error reported by the directory iterator; that
// error is printed to llvm::errs(). Entries recorded before the error are kept.
void getMapInformationInRepo(const std::string &directory, std::map<std::string, std::map<std::string, std::vector<std::string>>> &myMap) {
  std::error_code ec;
  for (llvm::sys::fs::recursive_directory_iterator dirIter(directory, ec), dirEnd;
       !ec && dirIter != dirEnd;
       dirIter.increment(ec)) {
    if (!llvm::sys::fs::is_regular_file(dirIter->path())) {
      continue;
    }
    const std::string lastLine = getLastLine(dirIter->path());
    const std::vector<std::string> programs = parseProgramPaths(lastLine);
    for (const std::string &program_path : programs) {
      myMap[directory][program_path].push_back(dirIter->path());
    }
  }
  // The check has to live after the loop: inside the body `ec` is always
  // clear, because the loop condition ends the iteration as soon as it is set.
  if (ec) {
    llvm::errs() << "Error accessing directory: " << ec.message() << "\n";
  }
}
// Runs matcher `M` over the AST of `Unit` and returns the node of type `Node`
// bound by the first match.
//
// Both "there is at least one match" and "the bound node has type Node" are
// only enforced with assert().
template <typename Node, typename Matcher>
Node *getFirstDecl(Matcher M, const std::unique_ptr<ASTUnit> &Unit) {
  // Key under which the matched node is bound and later looked up.
  const char *const BindKey = "bindStr";

  auto Matches = match(M.bind(BindKey), Unit->getASTContext());
  assert(!Matches.empty());

  // getNodeAs yields a pointer to const; callers get a mutable node.
  const Node *First = Matches.front().template getNodeAs<Node>(BindKey);
  assert(First);
  return const_cast<Node *>(First);
}
//temporary function to print our Maps
/*
Map has the following format:
{
"pathRepository1":
{
"program0": ["path1.c", "path2.c", ...], //these c files are compiled together. We want to merge these into one AST
"program1" : ["path5.c", "path1.c", ...], //per repository there might be multiple programs
...
},
...
}
*/
// Streams output produced by `Write` to the destination `OutputFileName`.
//
// Special destinations:
//   "-"          -> `Write` receives llvm::errs().
//   "/dev/null"  -> `Write` receives a raw_null_ostream.
// For any other name the data is first written to a temporary file created
// from the pattern "<OutputFileName>.temp-stream-%%%%%%". If `Write` fails the
// temporary file is discarded, otherwise it is kept under `OutputFileName`.
//
// Returns the error from creating the temporary file, from `Write` (joined
// with a discard error, if any), or from keeping the file.
Error writeToOutputErr(StringRef OutputFileName,
                       std::function<Error(raw_ostream &)> Write) {
  if (OutputFileName == "-")
    return Write(llvm::errs());

  if (OutputFileName == "/dev/null") {
    raw_null_ostream Out;
    return Write(Out);
  }

  const unsigned Mode = sys::fs::all_read | sys::fs::all_write;
  Expected<sys::fs::TempFile> Temp =
      sys::fs::TempFile::create(OutputFileName + ".temp-stream-%%%%%%", Mode);
  if (!Temp)
    return createFileError(OutputFileName, Temp.takeError());

  // The descriptor stays owned by the TempFile (shouldClose=false); the stream
  // is deliberately unbuffered.
  raw_fd_ostream Out(Temp->FD, /*shouldClose=*/false, /*unbuffered=*/true);

  if (Error E = Write(Out)) {
    if (Error DiscardError = Temp->discard())
      return joinErrors(std::move(E), std::move(DiscardError));
    return E;
  }
  Out.flush();

  return Temp->keep(OutputFileName);
}
bool Save(const std::unique_ptr<ASTUnit> &Unit , StringRef File) {
if (llvm::Error Err = writeToOutputErr( //instead of llvm::writeToOutput we use own adapted to errs()
File, [&Unit](llvm::raw_ostream &Out) { //experiment here with the IO stream to avoid "Segmentation fault (core dumped)"
//raw_ostream is default
return Unit->serialize(Out) ? llvm::make_error<llvm::StringError>( //default Out x -> llvm::errs() x
"ASTUnit serialization failed",
llvm::inconvertibleErrorCode())
: llvm::Error::success();
})) {
consumeError(std::move(Err));
return true;
}
return false;
}
void mergeASTsAndSave(const std::map<std::string, std::map<std::string, std::vector<std::string>>> &myMap) {
std::vector<std::string> IncludeDirs = {"-fsyntax-only", "-Wfatal-errors","-Wno-implicit-int","-Wno-int-conversion", "-Wno-deprecated-declarations", "-Wno-implicit-function-declaration","-Wno-unused-value"
,"-I/home/toge/clang-llvm/lib/clang/19/include/","-I/usr/include","-I/usr/include/x86_64-linux-gnu", "-I/usr/local/include", "-I/usr/lib/gcc/x86_64-linux-gnu/11/include"};
for (const auto &[repoName, programs] : myMap) {
for (const auto &[programName, files] : programs) {
// Build an empty ToUnit, but give information about the path for proper include-resolving purposes.
std::unique_ptr<ASTUnit> ToUnit = buildASTFromCodeWithArgs("",IncludeDirs,"to.c");
ToUnit->enableSourceFileDiagnostics();
auto ImporterState = std::make_shared<ASTImporterSharedState>(*ToUnit->getASTContext().getTranslationUnitDecl());
for (const auto &file : files) { //Currently in one repo, at one program. Here we traverse the files of program.
llvm::outs() << "prepare file read maybe after here" << "\n";
llvm::outs() << file << "\n";
std::ifstream fileStream(file);
std::stringstream buffer;
buffer << fileStream.rdbuf();
std::string fileContent = buffer.str();
std::unique_ptr<ASTUnit> FromUnit = buildASTFromCodeWithArgs(fileContent, IncludeDirs, file);
FromUnit->enableSourceFileDiagnostics();
ASTImporter Importer(ToUnit->getASTContext(), ToUnit->getFileManager(),
FromUnit->getASTContext(), FromUnit->getFileManager(),
/*MinimalImport=*/false, ImporterState);
auto *FromTU = FromUnit->getASTContext().getTranslationUnitDecl();
for (auto *D : FromTU->decls()) {
// Don't re-import __va_list_tag, __builtin_va_list.
if (const auto *ND = dyn_cast<NamedDecl>(D))
if (IdentifierInfo *II = ND->getIdentifier())
if (II->isStr("__va_list_tag") || II->isStr("__builtin_va_list"))
continue;
llvm::Expected<Decl *> ToDOrError = Importer.Import(D);
/*if (ToDOrError) {
ToUnit->addTopLevelDecl(*ToDOrError); ?
} else {
llvm::consumeError(ToDOrError.takeError());
}*/
}
}
std::filesystem::path outputASTFile = std::filesystem::path(repoName) / (programName + ".ast");
ToUnit->Save(outputASTFile.string());
ToUnit->getASTContext().getTranslationUnitDecl()->dump(llvm::errs()); // Tree dump
ToUnit->getASTContext().getTranslationUnitDecl()->print(llvm::errs()); // raw output
}
}
}
// Entry point: collects the repositories directly below "C_COMPILE", builds
// the repository -> program -> files map for each of them and merges/saves one
// AST per program.
//
// Returns 1 when "C_COMPILE" is not a directory, 0 otherwise. The command-line
// arguments are currently not used.
int main(int argc, const char **argv) {
  const std::string targetDirectory = "C_COMPILE";
  if (!llvm::sys::fs::is_directory(targetDirectory)) {
    llvm::errs() << "Error: " << targetDirectory << " is not a directory.\n";
    return 1;
  }

  std::vector<std::string> allRepos;
  getImmediateDirectories(targetDirectory, allRepos);

  std::map<std::string, std::map<std::string, std::vector<std::string>>> myMap;
  for (const auto &current_repo : allRepos) {
    getMapInformationInRepo(current_repo, myMap);
  }

  mergeASTsAndSave(myMap);
  return 0;
}
Based on "ASTImporter: Merging Clang ASTs — Clang 19.0.0git documentation" and on ASTMerger.cpp (which is meant to merge AST files), I have made a tool that can iterate over directories, merge related source files into one big AST, and dump/save/print it. I have made a couple of test folders that contain C files; all C files that are related share a common comment at the end of the source file. I have uploaded some test folders in case you want to experiment: "LLVM ASTImporter Bug - Google Drive". The tool basically traverses the directories recursively and creates a map of related source files. It then loops over each group of related source files and tries to merge them with the Importer method.
If I iterate over every file and just create independent ASTs and save/dump/print them, I get absolutely no errors (that is, if we move the save, dump, and print calls into the loop and change ToUnit → FromUnit).
The moment I start to merge them, that is, import the related "from" contexts into a single empty "to" context with the Importer method, things get bizarre. E.g., the importer does not fill in prototypes with their implementations, but rather keeps the prototypes and re-imports the implementations. I now also get an LLVM IO error or "Segmentation fault (core dumped)". Where is my mistake?