Memory Store/Load Optimization Issue (Emulating stack)

Hello,

I am trying to emulate the “stack” the way x86 uses it with push/pop, so that afterwards I can run LLVM’s optimizer passes to simplify the code (reduce the junk).

The LLVM IR code:

define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) {
; push foo (On “stack”)
%sp_1 = sub i32 %sp, 4
%sp_1_ptr = inttoptr i32 %sp_1 to i32*
store i32 %foo, i32* %sp_1_ptr, align 4

; push bar
%sp_2 = sub i32 %sp_1, 4
%sp_2_ptr = inttoptr i32 %sp_2 to i32*
store i32 %bar, i32* %sp_2_ptr, align 4

; val1 = pop (val1 = bar)
%sp_3_ptr = inttoptr i32 %sp_2 to i32*
%val1 = load i32, i32* %sp_3_ptr, align 4
%sp_3 = add i32 %sp_2, 4

; val2 = pop (val2 = foo)
%sp_4_ptr = inttoptr i32 %sp_3 to i32*
%val2 = load i32, i32* %sp_4_ptr, align 4
%sp_4 = add i32 %sp_3, 4

%ret_1 = insertvalue { i32, i32, i32 } undef, i32 %val1, 0
%ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1
%ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp_4, 2

ret { i32, i32, i32 } %ret_3
}

This code “pushes” two values onto the stack and pops them again, so afterwards “foo” and “bar” are swapped in the returned values.

After running this through “opt -O2 ./test.ll”, I am getting this:

define { i32, i32, i32 } @test(i32 %foo, i32 %bar, i32 %sp) #0 {
%sp_1 = add i32 %sp, -4
%1 = zext i32 %sp_1 to i64
%sp_1_ptr = inttoptr i64 %1 to i32*
store i32 %foo, i32* %sp_1_ptr, align 4
%sp_2 = add i32 %sp, -8
%2 = zext i32 %sp_2 to i64
%sp_2_ptr = inttoptr i64 %2 to i32*
store i32 %bar, i32* %sp_2_ptr, align 4
%val2 = load i32, i32* %sp_1_ptr, align 4
%ret_1 = insertvalue { i32, i32, i32 } undef, i32 %bar, 0 ; Swapped
%ret_2 = insertvalue { i32, i32, i32 } %ret_1, i32 %val2, 1; Not Swapped (Not optimized; Should be %foo)
%ret_3 = insertvalue { i32, i32, i32 } %ret_2, i32 %sp, 2
ret { i32, i32, i32 } %ret_3
}

As you can see, the IR has gained some additional code, e.g. the zext. But the main problem is that val2 hasn’t been optimized.
Could anyone give me a hint as to what is preventing the second value from being optimized? (My guess would be the zext, because I am using %sp as a 32-bit pointer although the “target” is 64-bit.)
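In case it matters, the module targets 64-bit pointers; with a datalayout along these lines (an illustrative string, not copied from my module), the i32 address gets widened to the pointer width, which is presumably where the zext comes from:

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

;   %sp_1_ptr = inttoptr i32 %sp_1 to i32*
; becomes
;   %1 = zext i32 %sp_1 to i64
;   %sp_1_ptr = inttoptr i64 %1 to i32*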

Regards,
Paul

Two points:

  • Using inttoptr is a mistake here. GEPs are strongly preferred and provide strictly more aliasing information to the optimizer.
  • The zext is a bit weird. I’m not sure where that came from, but I wouldn’t bother looking into it until the preceding point is addressed.

In general, you may find these docs useful:
Philip

Thank you for the hint.

I adjusted the code and it works:

The code after replacing inttoptr with getelementptr:

define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
entry:
; push foo (On “stack”)
%sp_1 = getelementptr i8, i8* %sp, i32 -4
%sp_1_ptr = bitcast i8* %sp_1 to i32*
store i32 %foo, i32* %sp_1_ptr, align 4

; push bar
%sp_2 = getelementptr i8, i8* %sp_1, i32 -4
%sp_2_ptr = bitcast i8* %sp_2 to i32*
store i32 %bar, i32* %sp_2_ptr, align 4

; val1 = pop (val1 = bar)
%sp_3_ptr = bitcast i8* %sp_2 to i32*
%val1 = load i32, i32* %sp_3_ptr, align 4
%sp_3 = getelementptr i8, i8* %sp_2, i32 4

; val2 = pop (val2 = foo)
%sp_4_ptr = bitcast i8* %sp_3 to i32*
%val2 = load i32, i32* %sp_4_ptr, align 4
%sp_4 = getelementptr i8, i8* %sp_3, i32 4

%ret_1 = insertvalue { i32, i32, i8* } undef, i32 %val1, 0
%ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %val2, 1
%ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp_4, 2

ret { i32, i32, i8* } %ret_3
}

After optimization (“opt -instcombine ./code.ll -S”)

define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp) {
entry:
%sp_1 = getelementptr i8, i8* %sp, i64 -4
%sp_1_ptr = bitcast i8* %sp_1 to i32*
store i32 %foo, i32* %sp_1_ptr, align 4
%sp_2 = getelementptr i8, i8* %sp, i64 -8
%sp_2_ptr = bitcast i8* %sp_2 to i32*
store i32 %bar, i32* %sp_2_ptr, align 4
%ret_1 = insertvalue { i32, i32, i8* } undef, i32 %bar, 0
%ret_2 = insertvalue { i32, i32, i8* } %ret_1, i32 %foo, 1
%ret_3 = insertvalue { i32, i32, i8* } %ret_2, i8* %sp, 2
ret { i32, i32, i8* } %ret_3
}

My only questions are now:

  • How is it that inttoptr cannot provide the alias information needed to optimize that store/load away?
  • Might it be possible to get inttoptr to provide such alias information?
  • I came across MemorySSA while browsing through the LLVM source. Is it possible to use MemorySSA to do such an optimization without alias analysis?
  • Where in the source should I look for the code that does this kind of optimization? (Is it instcombine, which uses lib/Analysis/Loads.cpp?)

Regards,
Paul


> My only questions are now:
> - How is it that inttoptr cannot provide the alias information needed to optimize that store/load away?

Because nothing tracks what happens to the ints, what happens when they are converted back to pointers, or whether it's sane :)

http://llvm.org/docs/GetElementPtr.html#how-is-gep-different-from-ptrtoint-arithmetic-and-inttoptr
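Roughly speaking (this glosses over what BasicAA actually does), with GEPs the optimizer can relate both addresses to the same base and two distinct constant offsets, whereas after a round trip through integers the pointers carry no usable provenance and have to be treated conservatively:

; Same base, different constant offsets: provably no alias.
%p = getelementptr i8, i8* %sp, i64 -4
%q = getelementptr i8, i8* %sp, i64 -8

; After the integer round trip the connection to %sp is lost as far as the
; provenance rules are concerned, so these must be treated as may-alias.
%pi = inttoptr i32 %sp_1 to i32*
%qi = inttoptr i32 %sp_2 to i32*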

> - Might it be possible to get inttoptr to provide such alias information?

It doesn't make a lot of sense to try in most cases.
Most of the cases where ptrtoint/inttoptr are useful are those where you want to do crazy things to the pointer.

> - I came across MemorySSA while browsing through the LLVM source. Is it possible to use MemorySSA to do such an optimization without alias analysis?

MemorySSA relies on alias analysis to generate the SSA form.
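To give a feel for it, the GEP version of your function comes out roughly like this (printing format approximate), and deciding which MemoryDef a given MemoryUse is linked to is exactly the question alias analysis answers:

; 1 = MemoryDef(liveOnEntry)
store i32 %foo, i32* %sp_1_ptr, align 4
; 2 = MemoryDef(1)
store i32 %bar, i32* %sp_2_ptr, align 4
; MemoryUse(2)
%val1 = load i32, i32* %sp_3_ptr, align 4
; MemoryUse(1)
%val2 = load i32, i32* %sp_4_ptr, align 4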

> - Where in the source should I look for the code that does this kind of optimization? (Is it instcombine, which uses lib/Analysis/Loads.cpp?)

It's probably a combination of opts. The most likely candidate is -gvn, but I would look at the pass dumps after each opt.

I recommend running your IR through opt with -print-after-all, and you'll see what optimization passes are doing what.
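Something along these lines (the exact spelling of the flags can vary a bit between releases):

opt -O2 -print-after-all -S test.ll -o /dev/null 2> pass-dumps.txt

The per-pass IR dumps go to stderr, so redirecting that to a file and searching for where the load disappears tells you which pass did the forwarding.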

-Hal

Thanks for the answers, although I am not sure I’ve understood the docs on how inttoptr/ptrtoint differ from GEP.
They say: “It’s invalid to take a GEP from one object, address into a different separately allocated object, and dereference it.”
To get back to my original intention: I would like to “emulate” some x86 instructions in LLVM IR, but from my reading of that aliasing rule I am not sure whether I am breaking it.

For example when translating this x86 code to llvm ir:

push eax
add esp, 2
push ecx

; push foo (On “stack”)
%sp_1 = getelementptr i8, i8* %sp, i32 -4
%sp_1_ptr = bitcast i8* %sp_1 to i32*
store i32 %foo, i32* %sp_1_ptr, align 4

%sp_x = getelementptr i8, i8* %sp_1, i32 2

; push bar
%sp_2 = getelementptr i8, i8* %sp_x, i32 -4
%sp_2_ptr = bitcast i8* %sp_2 to i32*
store i32 %bar, i32* %sp_2_ptr, align 4

Both objects (eax, ecx) will overlap because of the size difference (eax = i32). What are the consequences of doing this? Will this break alias analysis for further instructions?


> Although I am not sure I’ve understood the docs on how inttoptr/ptrtoint differ from GEP.
> They say: “It’s invalid to take a GEP from one object, address into a different separately allocated object, and dereference it.”
This refers to the underlying allocation that created the memory. Where did %sp come from? Is it an alloca instruction, or from some other source?


> Both objects (eax, ecx) will overlap because of the size difference (eax = i32). What are the consequences of doing this? Will this break alias analysis for further instructions?

Partially overlapping writes do not, in themselves, break anything. AA should handle that just fine.
-Hal

> This refers to the underlying allocation that created the memory. Where did %sp come from? Is it an alloca instruction, or from some other source?

It's allocated via malloc and passed to the function.
define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp_x)

> Partially overlapping writes do not, in themselves, break anything. AA should handle that just fine.

What do you mean by partially?


> It's allocated via malloc and passed to the function.
> define { i32, i32, i8* } @test(i32 %foo, i32 %bar, i8* %sp_x)

Then you're fine unless you run off the end of the memory you malloc'd. This is not saying anything tricky: no buffer overruns. Even if there happens to be another valid object there, you can't run off the end of your allocated buffer (doing so is undefined behavior).


> What do you mean by partially?

It looks like the last two bytes of %bar overlap with the first two bytes of %foo. That's a partial overlap of the writes (as each write is four bytes).
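Concretely, using the offsets from your example (%sp_1 = %sp - 4, %sp_x = %sp - 2, %sp_2 = %sp - 6; addresses increase to the right):

;  address:  sp-6  sp-5  sp-4  sp-3  sp-2  sp-1
;  %bar:     [b0]  [b1]  [b2]  [b3]
;  %foo:                 [f0]  [f1]  [f2]  [f3]
;
; the bytes at sp-4 and sp-3 are written by both stores.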

-Hal

Hi again,

So I finally gave up on trying to get the conversion (x86 push/pop/mov/add) working, because it involves a lot of crazy pointer arithmetic, and inttoptr/ptrtoint don't provide any alias analysis information.
Daniel, you said it doesn't make much sense to provide it, but in my case it is actually very much needed. You didn't say it wasn't possible to provide it, so it is possible, right? Could you point me to where I should look to implement such an analysis? Would it be complex/non-trivial to implement?

Hey Paul,

> You didn't say it wasn't possible to provide it, so it is possible, right?

Not in the general case, no.
(pardon the pseudocode)

#include <stdint.h>

void foo(struct foo *a)
{
  uintptr_t c = (uintptr_t)a;   /* ptrtoint */
  c += 10;
  void *newptr = (void *)c;     /* inttoptr */
}

Where does newptr point? Somewhere in a? To a field?

Well, you could try to do the analysis and do the math, but the amount of
calculation you are going to try to statically evaluate is unbounded (and
you can't statically evaluate all of it).
If you want to play this game, you need a good range analysis.
A friend of mine published a good paper on this that will appear in CGO
2016:

http://homepages.dcc.ufmg.br/~fernando/publications/papers/CGO16_paisante.pdf
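For instance (hypothetical code, just to show where a range comes in): once the offset is no longer a constant, whether the resulting pointer still lands inside the original object depends entirely on the range of that offset.

#include <stdint.h>

struct obj { int field[4]; };

void bar(struct obj *a, int i)
{
  uintptr_t c = (uintptr_t)a;   /* ptrtoint */
  c += i;                       /* offset is not a compile-time constant */
  int *p = (int *)c;            /* inttoptr */
  *p = 0;                       /* in bounds only for certain ranges of i */
}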

Hi Daniel,

First of all, thank you for the paper. I quickly skimmed it, and I wasn't sure whether it's "the" approach for the issues I have, because it's about estimating bounds for memory accesses and I have no idea how that would help. Or maybe it's just me not actually understanding the alias analysis.
I don't quite understand why I need bounds for the kind of optimization I "want".

Because when translating x86 mov/push/pop/add instructions to equivalent LLVM IR (preserving the semantics) you **need** pointer arithmetic, and there are no "bounds" (in the sense that the semantics have to be preserved even if an invalid memory access occurs).

Let me show you an example of how I would imagine an optimization for these kinds of things:

Let's say we have these x86 instructions:

push ecx
push ecx
xchg esp, ecx
mov [ecx + 4], ebx
mov [ecx], eax
xchg esp, ecx
pop ebx
pop eax

My translator would emit something like this:

define { i32, i32, i32, i32 } @Unit01(i32 %eax, i32 %ebx, i32 %ecx, i32 %esp) {
  ; push ecx
  %esp_1 = sub i32 %esp, 4
  %esp_ptr1 = inttoptr i32 %esp_1 to i32*
  store i32 %ecx, i32* %esp_ptr1, align 4

  ; push ecx
  %esp_2 = sub i32 %esp_1, 4
  %esp_ptr2 = inttoptr i32 %esp_2 to i32*
  store i32 %ecx, i32* %esp_ptr2, align 4

  ; xchg ecx, esp - This should be eliminated as it is tracked by the translator
  %esp_3 = bitcast i32 %ecx to i32
  %ecx_1 = bitcast i32 %esp_2 to i32

  ; mov [ecx + 4], ebx
  %mov_tmp1 = add i32 %ecx_1, 4
  %mov_ptr1 = inttoptr i32 %mov_tmp1 to i32*
  store i32 %ebx, i32* %mov_ptr1, align 4

  ; mov [ecx], eax
  %mov_ptr2 = inttoptr i32 %ecx_1 to i32*
  store i32 %eax, i32* %mov_ptr2, align 4

  %esp_4 = bitcast i32 %ecx_1 to i32
  %ecx_2 = bitcast i32 %esp_3 to i32

  ; pop ebx
  %pop_ptr1 = inttoptr i32 %esp_4 to i32*
  %ebx_1 = load i32, i32* %pop_ptr1, align 4
  %esp_5 = add i32 %esp_4, 4

  ; pop eax
  %pop_ptr2 = inttoptr i32 %esp_5 to i32*
  %eax_1 = load i32, i32* %pop_ptr2, align 4
  %esp_6 = add i32 %esp_5, 4

  ; Construct the return object containing the register values
  ; %eax_1, %ebx_1, %ecx_2, %esp_6
  %ret_1 = insertvalue { i32, i32, i32, i32 } undef, i32 %eax_1, 0
  %ret_2 = insertvalue { i32, i32, i32, i32 } %ret_1, i32 %ebx_1, 1
  %ret_3 = insertvalue { i32, i32, i32, i32 } %ret_2, i32 %ecx_2, 2
  %ret_4 = insertvalue { i32, i32, i32, i32 } %ret_3, i32 %esp_6, 3
  ret { i32, i32, i32, i32 } %ret_4
}

I could change the types (the i32 register values) to i8* so that every arithmetic operation could be converted to GEPs, but then I wouldn't be able to do operations other than add/sub.

I could give only esp and ebp i8* types, so I could still do arithmetic on the other registers, but that would be limited: what if an xchg swaps eax and esp? (I would still kind of need inttoptr/ptrtoint.)

So there is only one option left, using i32 for every register.

My idea is to build a symbolic expression for every store/load address and treat inttoptr/ptrtoint as no-op instructions, as if they were part of the arithmetic expression. These expressions would be stored in a set, simplified (e.g. constant folding), and finally compared when stores/loads occur. If a store and a load have the same symbolic expression, they point to the same memory location.
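Very roughly, I imagine the normalization step looking something like this in C++ against the LLVM API (just a sketch, untested):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// A "symbolic address": some base SSA value (e.g. the incoming %esp)
// plus an accumulated constant byte offset.
struct SymAddr {
  Value *Base = nullptr;
  int64_t Offset = 0;
  bool operator==(const SymAddr &O) const {
    return Base == O.Base && Offset == O.Offset;
  }
};

// Walk backwards through inttoptr/ptrtoint (treated as no-ops) and
// "x add/sub constant" until we hit the base value.
static SymAddr decompose(Value *Addr) {
  SymAddr S;
  Value *V = Addr;
  for (;;) {
    if (auto *Cast = dyn_cast<IntToPtrInst>(V)) { V = Cast->getOperand(0); continue; }
    if (auto *Cast = dyn_cast<PtrToIntInst>(V)) { V = Cast->getOperand(0); continue; }
    if (auto *BO = dyn_cast<BinaryOperator>(V)) {
      if (auto *C = dyn_cast<ConstantInt>(BO->getOperand(1))) {
        if (BO->getOpcode() == Instruction::Add) {
          S.Offset += C->getSExtValue(); V = BO->getOperand(0); continue;
        }
        if (BO->getOpcode() == Instruction::Sub) {
          S.Offset -= C->getSExtValue(); V = BO->getOperand(0); continue;
        }
      }
    }
    break; // anything else (non-constant operand, call, ...) ends the walk
  }
  S.Base = V;
  return S;
}

// Two accesses whose addresses decompose to equal SymAddrs (and have equal
// sizes) refer to the same location, so a later load could be forwarded from
// an earlier store if no possibly-overlapping store intervenes.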

Example (for the LLVM IR above):

; push ecx
(1) Mem( ptr( (%esp - 4), i32) ) = %ecx

; push ecx
Mem( ptr( ((%esp - 4) - 4), i32) ) = %ecx
=> ; Optimization
(2) Mem( ptr( (%esp - 8), i32) ) = %ecx

; mov [ecx + 4], ebx
Mem( ptr( (((%esp - 4) - 4) + 4), i32) ) = %ebx
=> ; Optimization
(3) Mem( ptr( (%esp - 4), i32) ) = %ebx

; mov [ecx], eax
Mem( ptr( ((%esp - 4) - 4), i32) ) = %eax
=> ; Optimization
(4) Mem( ptr( (%esp - 8), i32) ) = %eax

; pop ebx
%ebx_1 = Mem( ptr( ((%esp - 4) - 4), i32 ) )
=> ; Optimization
(5) %ebx_1 = Mem( ptr( (%esp - 8), i32 ) )

; pop eax
%eax_1 = Mem( ptr( (((%esp - 4) - 4) + 4), i32 ) )
=> ; Optimization
(6) %eax_1 = Mem( ptr( (%esp - 4), i32 ) )

In (5), %ebx_1 can be replaced with %eax because (5) and (4) have the same symbolic expression; likewise, in (6), %eax_1 can be replaced with %ebx because (6) and (3) match.

This is a very trivial approach and it has a lot of flaws, but to come back to the issue you mentioned: I am not sure how I should understand the bounds.

You could try to do something like that to calculate the bounds of newptr