Basic source-to-source transformation with Clang

I just downloaded LLVM and CLANG and am trying to write a basic
source-to-source transformation tool.

My requirement is to transform the below sample code:

int inc(int& p)
{
        p++;
        printf("In inc [%d]\n", p);
        return p;
}
int main()
{
        int i = 0;
        int y,z;
        if(y == 0)
                print(inc(i) , inc(i));
        else
        {
                print(inc(i) , inc(i));
        }
        printf("y = [%d] z = [%d]\n", y , z);
        return 0;
}

To:

int inc(int& p)
{
        p++;
        printf("%s %d", __FILE__, __LINE__);
        printf("In inc [%d]\n", p);
        printf("%s %d", __FILE__, __LINE__);
        return p;
}

int main()
{
        int i = 0;
        printf("%s %d", __FILE__, __LINE__);
        int y,z;
        printf("%s %d", __FILE__, __LINE__);
        if(y == 0)
                print(inc(i) , inc(i));
        else
        {
                print(inc(i) , inc(i));
                printf("%s %d", __FILE__, __LINE__);
        }
        printf("y = [%d] z = [%d]\n", y , z);
        printf("%s %d", __FILE__, __LINE__);
        return 0;
}

This is what I have been able to develop so far with help from the Internet:

#include <cstdio>
#include <string>
#include <sstream>

#include "clang/AST/ASTConsumer.h"
#include "clang/AST/RecursiveASTVisitor.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/FileManager.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TargetOptions.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Frontend/CompilerInstance.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Parse/ParseAST.h"
#include "clang/Rewrite/Rewriter.h"
#include "clang/Rewrite/Rewriters.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/raw_ostream.h"

using namespace clang;
using namespace std;

// By implementing RecursiveASTVisitor, we can specify which AST nodes
// we're interested in by overriding relevant methods.
class MyASTVisitor : public RecursiveASTVisitor<MyASTVisitor>
{
public:
    MyASTVisitor(Rewriter &R)
        : TheRewriter(R)
    {}

    bool VisitStmt(Stmt *s) {
   if (isa<CompoundStmt>(s)) {
    CompoundStmt *Statement = cast<CompoundStmt>(s);
    TheRewriter.InsertText(Statement->getLocStart(),
             "printf(\"%s %d\", __FILE__, __LINE__);\n",
             true, true);
    }

        return true;
    }

    bool VisitFunctionDecl(FunctionDecl *f) {
        // Only function definitions (with bodies), not declarations.
        if (f->hasBody()) {
            Stmt *FuncBody = f->getBody();

            // Type name as string
            QualType QT = f->getResultType();
            string TypeStr = QT.getAsString();

            // Function name
            DeclarationName DeclName = f->getNameInfo().getName();
            string FuncName = DeclName.getAsString();

            // Add comment before
            stringstream SSBefore;
            SSBefore << "// Begin function " << FuncName << " returning "
                     << TypeStr << "\n";
            SourceLocation ST = f->getSourceRange().getBegin();
            TheRewriter.InsertText(ST, SSBefore.str(), true, true);

            // And after
            stringstream SSAfter;
            SSAfter << "\n// End function " << FuncName << "\n";
            ST = FuncBody->getLocEnd().getLocWithOffset(1);
            TheRewriter.InsertText(ST, SSAfter.str(), true, true);
        }

        return true;
    }

private:
    void AddBraces(Stmt *s);

    Rewriter &TheRewriter;
};

// Implementation of the ASTConsumer interface for reading an AST produced
// by the Clang parser.
class MyASTConsumer : public ASTConsumer
{
public:
    MyASTConsumer(Rewriter &R)
        : Visitor(R)
    {}

    // Override the method that gets called for each parsed top-level
    // declaration.
    virtual bool HandleTopLevelDecl(DeclGroupRef DR) {
        for (DeclGroupRef::iterator b = DR.begin(), e = DR.end();
             b != e; ++b)
            // Traverse the declaration using our AST visitor.
            Visitor.TraverseDecl(*b);
        return true;
    }

private:
    MyASTVisitor Visitor;
};

int main(int argc, char *argv[])
{
    if (argc != 2) {
        llvm::errs() << "Usage: rewritersample <filename>\n";
        return 1;
    }

    // CompilerInstance will hold the instance of the Clang compiler for us,
    // managing the various objects needed to run the compiler.
    CompilerInstance TheCompInst;
    TheCompInst.createDiagnostics(0, 0);

    // Initialize target info with the default triple for our platform.
    TargetOptions TO;
    TO.Triple = llvm::sys::getDefaultTargetTriple();
    TargetInfo *TI = TargetInfo::CreateTargetInfo(
        TheCompInst.getDiagnostics(), TO);
    TheCompInst.setTarget(TI);

    TheCompInst.createFileManager();
    FileManager &FileMgr = TheCompInst.getFileManager();
    TheCompInst.createSourceManager(FileMgr);
    SourceManager &SourceMgr = TheCompInst.getSourceManager();
    TheCompInst.createPreprocessor();
    TheCompInst.createASTContext();

    // A Rewriter helps us manage the code rewriting task.
    Rewriter TheRewriter;
    TheRewriter.setSourceMgr(SourceMgr, TheCompInst.getLangOpts());

    // Set the main file handled by the source manager to the input file.
    const FileEntry *FileIn = FileMgr.getFile(argv[1]);
    SourceMgr.createMainFileID(FileIn);
    TheCompInst.getDiagnosticClient().BeginSourceFile(
        TheCompInst.getLangOpts(),
        &TheCompInst.getPreprocessor());

    // Create an AST consumer instance which is going to get called by
    // ParseAST.
    MyASTConsumer TheConsumer(TheRewriter);

    // Parse the file to AST, registering our consumer as the AST consumer.
    ParseAST(TheCompInst.getPreprocessor(), &TheConsumer,
             TheCompInst.getASTContext());

    // At this point the rewriter's buffer should be full with the rewritten
    // file contents.
    const RewriteBuffer *RewriteBuf =
        TheRewriter.getRewriteBufferFor(SourceMgr.getMainFileID());
    llvm::outs() << string(RewriteBuf->begin(), RewriteBuf->end());

    return 0;
}

Below is the output I receive:

llvm/build/Release+Asserts/examples 1224> rewritersample test.cpp
test.cpp:1:12: error: expected ')'
int inc(int& p)
           ^
test.cpp:1:8: note: to match this '('
int inc(int& p)
       ^
test.cpp:1:12: error: parameter name omitted
int inc(int& p)
           ^
test.cpp:3:2: error: use of undeclared identifier 'p'
        p++;
        ^
test.cpp:4:26: error: use of undeclared identifier 'p'
        printf("In inc [%d]\n", p);
                                ^
test.cpp:5:9: error: use of undeclared identifier 'p'
        return p;
               ^
// Begin function inc returning int
int inc(int& p)
printf("%s %d", __FILE__, __LINE__);
{
   p++;
   printf("In inc [%d]\n", p);
   return p;
}
// End function inc

// Begin function main returning int
int main()
printf("%s %d", __FILE__, __LINE__);
{
   int i = 0;
   int y,z;
   if(y == 0)
    print(inc(i) , inc(i));
   else
    print(inc(i) , inc(i));
   printf("y = [%d] z = [%d]\n", y , z);
   return 0;
}
// End function main

Please let me know how to:
a) Stop the generation of 'error'(s)
b) Add statements in the compound block as suggested above

Lastly, it is a great tool to use. How should one gain expertise in
LLVM and Clang, given that there are very few tutorials and examples?

Regards,
Prakash

You're compiling the code as C, probably because you're completely bypassing all the driver logic.

John.

Thanks for your response but I am using g++ compiler:

g++ rewritersample.cpp -fno-rtti -D__STDC_LIMIT_MACROS
-D__STDC_CONSTANT_MACROS -o rewritersample \
                -Illvm/build/include <paths>
-Lllvm/build/Release+Asserts/lib(s)

Is there any compiler option that I need to add?

Please let me know how to overcome the problem?

Thanks in advance.

Regards,
Prakash

I meant that the clang instance your program creates is compiling its input as C,
not that you are compiling your program as C.

John.

Thanks John for the clarification.

Can you please let me know how to then create clang instance for C++ input file?

Also, I am trying to write code that would detect regular statements:

    bool VisitStmt(Stmt *s) {
        if (isa<CompoundStmt>(s)) {
                CompoundStmt *S = cast<CompoundStmt>(s);
                for (CompoundStmt::body_iterator I = S->body_begin(),
E = S->body_end(); I != E; ++I)
                {
                        Stmt *i = cast<Stmt>(*I);
                        std::cout << i->getStmtClassName();
                }
        }
      }

For a sample C++ code like below:

int main()
{
        int i = 0;
        int y,z;
        printf("y = [%d] z = [%d]\n", y , z); //regular code
        return 0;
}

But it is not giving any output. So how do we detect what type of
statement we are processing?

Thanks in advance.

Regards,
Prakash

I suggest looking at some of the example code to see how it initializes the
CompilerInstance. clang-interpreter is an example of this.

Alternatively, you might be happier just running as a clang plugin, in
which case you should look at the PrintFunctionNames example.

John.

Thanks John for the clarification.

Can you please let me know how to then create clang instance for C++ input file?

I suggest looking at some of the example code to see how it initializes the
CompilerInstance. clang-interpreter is an example of this.

Alternatively, you might be happier just running as a clang plugin, in
which case you should look at the PrintFunctionNames example.

For source-2-source translation, there are more alternatives with different trade-offs to consider:
http://clang.llvm.org/docs/Tooling.html

Cheers,
/Manuel

Thanks to all for the response.

I was able to make some progress and at least remove the C++ errors, but one
issue remains - identifying the include path for the C++ header file. I
now get output like:

fatal error: 'aio.h' file not found
#include <aio.h>

// Begin function inc returning int
int inc(int &p)
{
        p++;
        printf("In inc [%d]\n", p);
        return p;
}
// End function inc
……………………………………………………

The file aio.h is in /usr/include directory of my system:
llvm/build/Release+Asserts/examples> ls -l /usr/include/aio.h
-rw-r--r-- 1 root root 7135 Jul 17 2009 /usr/include/aio.h

My code modifications are:

    LangOptions languageOptions;
    languageOptions.CPlusPlus= 1;
    TheCompInst.createFileManager();
    FileManager &FileMgr = TheCompInst.getFileManager();
    TheCompInst.createSourceManager(FileMgr);
    CompilerInvocation* CI = new CompilerInvocation;
    CI->setLangDefaults(languageOptions, IK_CXX);
    TheCompInst.setInvocation(CI);
    DiagnosticsEngine DiagnosticsEngine = TheCompInst.getDiagnostics();
    HeaderSearch headerSearch(FileMgr, DiagnosticsEngine, languageOptions, TI);
    HeaderSearchOptions headerSearchOptions;
    headerSearchOptions.AddPath("/usr/include/c++/4.1.2/backward/",
frontend::Angled, false, false, false, false, false);
    headerSearchOptions.AddPath("/usr/include/c++/4.1.2/",
frontend::Angled, false, false, false, false, false);
    headerSearchOptions.AddPath("/usr/include/c++/", frontend::Angled,
false, false, false, false, false);
    headerSearchOptions.AddPath("/usr/include/", frontend::Angled,
false, false, false, false, false);
    Preprocessor preprocessor(DiagnosticsEngine, languageOptions, TI ,
SourceMgr, headerSearch, TheCompInst);
    TheCompInst.setPreprocessor(&preprocessor);

I am not sure what I need to do now to remove this error. I
have not yet tried the libtooling library, but since I have explored CLANG
using the Frontend APIs, I am proceeding with the same.

Thanks again in advance.

Regards,
Prakash

Thanks to all for the response.

I was able to make a progress at least remove the C++ errors but one
issue remains - identifying the include path for C++ header file. I
now get output as :

fatal error: 'aio.h' file not found
#include <aio.h>

// Begin function inc returning int

int inc(int &p)
{
p++;

printf("In inc [%d]\n", p);
return p;
}
// End function inc

……………………………………………………

The file aio.h is in /usr/include directory of my system:
llvm/build/Release+Asserts/examples> ls -l /usr/include/aio.h
-rw-r--r-- 1 root root 7135 Jul 17 2009 /usr/include/aio.h

My code modifications are:

LangOptions languageOptions;
languageOptions.CPlusPlus= 1;

TheCompInst.createFileManager();
FileManager &FileMgr = TheCompInst.getFileManager();
TheCompInst.createSourceManager(FileMgr);

CompilerInvocation* CI = new CompilerInvocation;
CI->setLangDefaults(languageOptions, IK_CXX);
TheCompInst.setInvocation(CI);
DiagnosticsEngine DiagnosticsEngine = TheCompInst.getDiagnostics();
HeaderSearch headerSearch(FileMgr, DiagnosticsEngine, languageOptions, TI);
HeaderSearchOptions headerSearchOptions;
headerSearchOptions.AddPath("/usr/include/c++/4.1.2/backward/",
frontend::Angled, false, false, false, false, false);
headerSearchOptions.AddPath("/usr/include/c++/4.1.2/",
frontend::Angled, false, false, false, false, false);
headerSearchOptions.AddPath("/usr/include/c++/", frontend::Angled,
false, false, false, false, false);
headerSearchOptions.AddPath("/usr/include/", frontend::Angled,
false, false, false, false, false);
Preprocessor preprocessor(DiagnosticsEngine, languageOptions, TI ,
SourceMgr, headerSearch, TheCompInst);
TheCompInst.setPreprocessor(&preprocessor);

I am not sure what I need to do now to remove this error? Though I
have not yet tried libtooling library but since I have explored CLANG
using Frontend APIs I am proceeding with the same.

Yea, getting the include path search logic right & set up correctly is not trivial. That’s exactly what the tooling library takes care of for you :) If you want to duplicate that logic, you’re of course free to, but I would guess it’ll take more time than switching your code to use the tooling library…

Cheers,
/Manuel

I tried to use “Tooling” but was unable to specify multiple header search directories, redefine the preprocessor and so on - as I understood, “Tooling” exposes a pretty narrow interface.
Prakash, I think this example will be helpful for you: http://eli.thegreenplace.net/2012/06/08/basic-source-to-source-transformation-with-clang/

2012/6/25 Slav <slavmfm@gmail.com>

I tried to use “Tooling” but was unable to specify multiple headers searching directories, redefine preprocessor and so on - as I understood, “Tooling” exposes pretty narrow interface.

LibTooling allows you to hand in any command line parameters. In fact, you should be able to use the exact same command line used for your compilation (CompilationDatabase does that for example, but you can also do that manually) if you append it after '--' to the call.

You cannot exchange the preprocessor, but you can register PPCallbacks if you just want to know what’s going on (which for me always was enough for source to source translations).

Also, please feel free to file bugs and assign them to me if you have feature requests for LibTooling :)

Cheers,
/Manuel

I made some progress with whatever support I received from this
discussion forum. Thanks to all who responded to my request. My
requirement now is to find a way I can print CLANG AST code to C++ / C
human readable statements:

    bool VisitStmt(Stmt *s) {
        if (isa<CompoundStmt>(s)) {
                CompoundStmt *S = cast<CompoundStmt>(s);
                for (CompoundStmt::body_iterator I = S->body_begin(),
E = S->body_end(); I != E; ++I)
                {
                        Stmt *i = cast<Stmt>(*I);
                        //How can I print the C++ / C statement which
CLANG is processing
                }
        }

        return true;
    }

Regards,
Prakash

I made some progress with whatever support I received from this
discussion forum. Thanks to all who responded to my request. My
requirement now is to find a way I can print CLANG AST code to C++ / C
human readable statements:

bool VisitStmt(Stmt *s) {
if (isa<CompoundStmt>(s)) {
CompoundStmt *S = cast<CompoundStmt>(s);
for (CompoundStmt::body_iterator I = S->body_begin(),
E = S->body_end(); I != E; ++I)
{
Stmt *i = cast<Stmt>(*I);

//How can I print the C++ / C statement which
CLANG is processing

You can use i->dumpAll() if you want to see what’s going on.

Cheers,
/Manuel

Thanks for the information as provided, but I get output like below:

(CallExpr 0x1a85c480 'int'
  (ImplicitCastExpr 0x1a85c468 'int (*)(const char *restrict, ...)'
<FunctionToPointerDecay>
    (DeclRefExpr 0x1a85c3e8 'int (const char *restrict, ...)' lvalue
Function 0x1a84e8e0 'printf' 'int (const char *restrict, ...)'))
  (ImplicitCastExpr 0x1a85c4b8 'const char *' <ArrayToPointerDecay>
    (StringLiteral 0x1a85c388 'const char [13]' lvalue "In inc [%d]\n"))
  (ImplicitCastExpr 0x1a85c4d0 'int' <LValueToRValue>
    (DeclRefExpr 0x1a85c3c0 'int' lvalue ParmVar 0x1a85c100 'p' 'int &')))

Kindly note that my requirement is to print the C++ code itself. The
dumpAll() output above was generated for the C++ code below:

p++;

Similarly for other like:

dumpAll()

(CallExpr 0x1a85dec8 'int'
  (ImplicitCastExpr 0x1a85deb0 'int (*)(const char *restrict, ...)'
<FunctionToPointerDecay>
    (DeclRefExpr 0x1a85de88 'int (const char *restrict, ...)' lvalue
Function 0x1a84e8e0 'printf' 'int (const char *restrict, ...)'))
  (ImplicitCastExpr 0x1a85df08 'const char *' <ArrayToPointerDecay>
    (StringLiteral 0x1a85cdd8 'const char [19]' lvalue "y = [%d] z = [%d]\n"))
  (ImplicitCastExpr 0x1a85df20 'int' <LValueToRValue>
    (DeclRefExpr 0x1a85ce18 'int' lvalue Var 0x1a85c6f0 'y' 'int'))
  (ImplicitCastExpr 0x1a85df38 'int' <LValueToRValue>
    (DeclRefExpr 0x1a85de60 'int' lvalue Var 0x1a85c760 'z' 'int')))

While C++ Code:

printf("y = [%d] z = [%d]\n", y , z);

Thanks in advance.

Regards,
Prakash

Thanks for the information as provided, but I get output like below:

(CallExpr 0x1a85c480 ‘int’
(ImplicitCastExpr 0x1a85c468 ‘int (*)(const char *restrict, …)’

(DeclRefExpr 0x1a85c3e8 ‘int (const char *restrict, …)’ lvalue
Function 0x1a84e8e0 ‘printf’ ‘int (const char *restrict, …)’))
(ImplicitCastExpr 0x1a85c4b8 ‘const char *’
(StringLiteral 0x1a85c388 ‘const char [13]’ lvalue “In inc [%d]\n”))
(ImplicitCastExpr 0x1a85c4d0 ‘int’
(DeclRefExpr 0x1a85c3c0 ‘int’ lvalue ParmVar 0x1a85c100 ‘p’ ‘int &’)))

Kindly note that my requirement is to print the C++ code itself like
dumpAll() method generated for above for below C++ code:

If you want to print the C++ code that produce the AST, you’ll want to get the SourceRange, and use SourceManager’s getCharacterData() method to get the underlying data - that will get you the C++ code as it was written by the user.

Cheers,
/Manuel

Thanks once again. I am really thankful to Manuel and others in group
who gave me more insight and confidence to work with CLANG.

However please note that when I used the below code to print the C++
code (input as an input param : rewritecode test.cpp):

                        Stmt *i = cast<Stmt>(*I);
                        SourceManager &SourceMgr =
TheCompInst.getSourceManager();
                        SourceLocation ST = i->getSourceRange().getBegin();
                        const char * code = SourceMgr.getCharacterData(ST);
                        std::cout << "STMT : " << code << "\n";

It prints more lines than I want each time. For example, if my
test.cpp file is like below:

int inc(int &p)
{
        p++;
        printf("%s %d",__FILE__,__LINE__);
        printf("In inc [%d]\n", p);
        return p;
}

I get output like:

STMT : p++;
        printf("In inc [%d]\n", p);
        return p;
}
......................................................

How can I print only a single statement at a time and not the
ones that follow it? I cannot tokenize on '\n' as there can be
multiple lines for a single statement. I would also really appreciate
it if I could get some references to good books, either offline or online,
at a beginner's level.

Thanks in advance.

Regards,
Prakash

Thanks once again. I am really thankful to Manuel and others in group
who gave me more insight and confidence to work with CLANG.

However please note that when I used the below code to print the C++
code (input as an input param : rewritecode test.cpp):

Stmt *i = cast<Stmt>(*I);

SourceManager &SourceMgr =
TheCompInst.getSourceManager();
SourceLocation ST = i->getSourceRange().getBegin();
const char * code = SourceMgr.getCharacterData(ST);
std::cout << "STMT : " << code << "\n";

Do the same for getEnd(), then you have a char * for the end, and you can print to that character…

Cheers,
/Manuel

I got one way to do that, not sure if correct:

                        Stmt *i = cast<Stmt>(*I);
                        SourceManager &SourceMgr =
TheCompInst.getSourceManager();
                        SourceLocation ST = i->getSourceRange().getBegin();
                        SourceLocation ED = i->getLocEnd().getLocWithOffset(4);
                        const char * scode = SourceMgr.getCharacterData(ST);
                        const char * ecode = SourceMgr.getCharacterData(ED);
                        char *code = (char *)calloc(ecode - scode + 1,
sizeof(char));
                        strncpy(code, scode, ecode - scode);
                        std::cout << "STMT :" << code << "\n";

But I found that if, for, while, etc. can have sub-blocks
inside a main compound block - how do I traverse through them? The
following code does not work:

if (isa<ForStmt>(i)) {
                for (CompoundStmt::body_iterator x = i->body_begin(),
y = i->body_end(); x != y; ++x)
                {

                }
   }
Compilation Error:

rewritersample.cpp:59: error: 'class clang::Stmt' has no member named
'body_begin'
rewritersample.cpp:59: error: 'class clang::Stmt' has no member named 'body_end'

Please help and thanks in advance.

Regards,
Prakash

I got one way to do that, not sure if correct:

Stmt *i = cast<Stmt>(*I);
SourceManager &SourceMgr =
TheCompInst.getSourceManager();
SourceLocation ST = i->getSourceRange().getBegin();

SourceLocation ED = i->getLocEnd().getLocWithOffset(4);
const char * scode = SourceMgr.getCharacterData(ST);
const char * ecode = SourceMgr.getCharacterData(ED);
char *code = (char *)calloc(ecode - scode + 1,
sizeof(char));
strncpy(code, scode, ecode - scode);

std::cout << "STMT :" << code << "\n";

But I found that if, for or while etc which can have sub blocks the
inside a main compound block - how do I traverse through them. The
following code does not works:

if (isa<ForStmt>(i)) {

You’ll want
if (ForStmt *S = dyn_cast<ForStmt>(i)) {
… use S …
}