Load CSE for consecutive bitfield accesses through pointer

Recently I came across an example that generates suboptimal code related to bitfields. The example is the following:

struct bits {
  unsigned b0 : 1;
  unsigned b1 : 1;
  unsigned b2 : 2;
  unsigned b4 : 2;
  unsigned b6 : 1;
  unsigned b7 : 1;
};

struct bits *b;

void clear_bits(void) { 
  b->b0 = 
  b->b1 = 
  b->b2 = 
  b->b4 = 
  b->b6 = 
  b->b7 = 0;
}

The IR for this code is this

define dso_local arm_aapcscc void @clear_bits() #0 {
entry:
  %0 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load = load i8, ptr %0, align 4
  %bf.clear = and i8 %bf.load, 127
  %bf.set = or i8 %bf.clear, 0
  store i8 %bf.set, ptr %0, align 4
  %1 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load1 = load i8, ptr %1, align 4
  %bf.clear2 = and i8 %bf.load1, -65
  %bf.set3 = or i8 %bf.clear2, 0
  store i8 %bf.set3, ptr %1, align 4
  %2 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load4 = load i8, ptr %2, align 4
  %bf.clear5 = and i8 %bf.load4, -49
  %bf.set6 = or i8 %bf.clear5, 0
  store i8 %bf.set6, ptr %2, align 4
  %3 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load7 = load i8, ptr %3, align 4
  %bf.clear8 = and i8 %bf.load7, -13
  %bf.set9 = or i8 %bf.clear8, 0
  store i8 %bf.set9, ptr %3, align 4
  %4 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load10 = load i8, ptr %4, align 4
  %bf.clear11 = and i8 %bf.load10, -3
  %bf.set12 = or i8 %bf.clear11, 0
  store i8 %bf.set12, ptr %4, align 4
  %5 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load13 = load i8, ptr %5, align 4
  %bf.clear14 = and i8 %bf.load13, -2
  %bf.set15 = or i8 %bf.clear14, 0
  store i8 %bf.set15, ptr %5, align 4
  ret void
}

And the result assembly for ARM is this

clear_bits:
        ldr     r0, .LCPI0_0
.LPC0_0:
        ldr     r0, [pc, r0]
        ldrb    r1, [r0]
        and     r1, r1, #223
        strb    r1, [r0]
        ldr     r0, .LCPI0_1
.LPC0_1:
        ldr     r0, [pc, r0]
        ldrb    r1, [r0]
        and     r1, r1, #239
        strb    r1, [r0]
        ldr     r0, .LCPI0_2
.LPC0_2:
        ldr     r0, [pc, r0]
        ldrb    r1, [r0]
        and     r1, r1, #247
        strb    r1, [r0]
        ldr     r0, .LCPI0_3
.LPC0_3:
        ldr     r0, [pc, r0]
        ldrb    r1, [r0]
        and     r1, r1, #251
        strb    r1, [r0]
        ldr     r0, .LCPI0_4
.LPC0_4:
        ldr     r0, [pc, r0]
        ldrb    r1, [r0]
        and     r1, r1, #253
        strb    r1, [r0]
        ldr     r0, .LCPI0_5
.LPC0_5:
        ldr     r0, [pc, r0]
        ldrb    r1, [r0]
        and     r1, r1, #254
        strb    r1, [r0]
        bx      lr
.LCPI0_0:
        .long   b-(.LPC0_0+8)
.LCPI0_1:
        .long   b-(.LPC0_1+8)
.LCPI0_2:
        .long   b-(.LPC0_2+8)
.LCPI0_3:
        .long   b-(.LPC0_3+8)
.LCPI0_4:
        .long   b-(.LPC0_4+8)
.LCPI0_5:
        .long   b-(.LPC0_5+8)
b:
        .long   0

GCC generates the optimal code for this example, because it realises that each consecutive load/store pair references the same 1-byte memory location, and it therefore removes the redundant ones so that only the first load and the last store remain. The resulting assembly looks like this:

clear_bits:
        adrp    x0, .LANCHOR0
        ldr     x1, [x0, #:lo12:.LANCHOR0]
        ldrb    w0, [x1]
        and     w0, w0, -64
        strb    w0, [x1]
        ret

This should be the result in LLVM too. If the struct is allocated on the stack or in BSS, then we get the optimal result for LLVM too

struct bits {
  unsigned b0 : 1;
  unsigned b1 : 1;
  unsigned b2 : 2;
  unsigned b4 : 2;
  unsigned b6 : 1;
  unsigned b7 : 1;
};

struct bits b;

void clear_bits(void) { 
  b.b0 = 
  b.b1 = 
  b.b2 = 
  b.b4 = 
  b.b6 = 
  b.b7 = 0;
}

With this example the result is the following

clear_bits:
        ldr     r0, .LCPI0_0
.LPC0_0:
        add     r0, pc, r0
        ldrb    r1, [r0]
        and     r1, r1, #192
        strb    r1, [r0]
        bx      lr
.LCPI0_0:
        .long   b-(.LPC0_0+8)
b:
        .zero   4

I’m creating this topic to discuss a potential implementation for this optimization, and I have a couple of questions, because I’m not that familiar with the LLVM middle-end. The feedback I received on my first attempt at this issue, which I implemented in the EarlyCSE pass, was that I should instead use type-based alias analysis (TBAA) metadata for the bitfield accesses. I checked out the codegen for member expressions and came across this comment in CGExpr.cpp: EmitLValueForField

    // TODO: Support TBAA for bit fields.
    LValueBaseInfo FieldBaseInfo(BaseInfo.getAlignmentSource());
    return LValue::MakeBitfield(Addr, Info, fieldType, FieldBaseInfo,
                                TBAAAccessInfo());

So first I tried to implement TBAA for bitfields. It was suggested that if TBAA metadata is present, some existing optimization might already handle this example using it, so that I wouldn’t need to implement a separate optimization. I gave it a try and came up with the following (which is potentially wrong, since I’m not an expert in this area):

    TBAAAccessInfo FieldTBAAInfo = base.getTBAAInfo();
    if (!FieldTBAAInfo.BaseType) {
      FieldTBAAInfo.BaseType = CGM.getTBAABaseTypeInfo(base.getType());
      assert(!FieldTBAAInfo.Offset &&
             "Nonzero offset for an access with no base type!");
    }

    if (FieldTBAAInfo.BaseType)
      FieldTBAAInfo.Offset += UseVolatile
                                  ? Info.VolatileStorageOffset.getQuantity()
                                  : Info.StorageOffset.getQuantity();

    FieldTBAAInfo.AccessType = CGM.getTBAATypeInfo(fieldType);
    FieldTBAAInfo.Size =
        getContext().getTypeSizeInChars(fieldType).getQuantity();

    FieldTBAAInfo.Kind = TBAAAccessKind::MayAlias;

    LValueBaseInfo FieldBaseInfo(BaseInfo.getAlignmentSource());
    return LValue::MakeBitfield(Addr, Info, fieldType, FieldBaseInfo,
                                FieldTBAAInfo);

But the example still produces the same suboptimal code after this. So next I dug deeper to find which optimizations could potentially solve my problem using the TBAAInfo, and why they do not fire after I create TBAAInfo for bitfield accesses. Then I found this in InstCombineLoadStoreAlloca.cpp: visitLoadInst

  // Do really simple store-to-load forwarding and load CSE, to catch cases
  // where there are several consecutive memory accesses to the same location,
  // separated by a few arithmetic operations.
  bool IsLoadCSE = false;
  if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) 

which starts from a load instruction and walks backwards in the basic block, trying to find an earlier load of the same value, while collecting instructions along the way that may modify memory. This optimization does recognise that the consecutive bitfield loads are the same, but because there are stores in between, it consults alias analysis to decide whether a given store modifies the memory location we load from. That query returns MayAlias, so the redundant load is not removed. At this point I should mention that, for some reason, there is no TBAAInfo on the store even though I created TBAAInfo for the bitfield access — I don’t fully understand how these metadata nodes get attached to instructions at the middle-end level. But even if the store did carry TBAAInfo, the result would be MustAlias and the redundant load still would not be removed. So basically I’m stuck with the implementation, and any help would be great.

As a side note, the reason my EarlyCSE implementation was rejected is, and I quote:

This optimization is not legal for the example as written. @b might be stored inside @b, in which case the store to %0 will change the value returned by the next load from @b.

However, I have tested my implementation with examples where @b is stored inside @b, and the results look correct to me — or am I missing something?

Some examples with results

Example 1

struct bits {
  unsigned b0 : 1;
  unsigned b1 : 1;
  unsigned b2 : 2;
  struct bits *b;
  unsigned b4 : 2;
  unsigned b6 : 1;
  unsigned b7 : 2;
};

struct bits *b;

void clear_bits(void) {
  b->b0 = 0;
  b->b1 = 0;
  b->b2 = 0;
  b->b4 = 0;
  b->b = b;
  b->b6 = 0;
  b->b7 = 0;
}

result

clear_bits:
	.fnstart
@ %bb.0:                                @ %entry
	ldr	r0, .LCPI0_0
.LPC0_0:
	ldr	r0, [pc, r0]
	ldrb	r1, [r0]
	ldrb	r2, [r0, #8]
	and	r1, r1, #240
	strb	r1, [r0]
	and	r1, r2, #252
	strb	r1, [r0, #8]
	ldr	r0, .LCPI0_1
.LPC0_1:
	ldr	r0, [pc, r0]
	ldrb	r1, [r0, #8]
	str	r0, [r0, #4]
	and	r1, r1, #227
	strb	r1, [r0, #8]
	bx	lr

Example 2

struct bits {
  unsigned b0 : 1;
  unsigned b1 : 1;
  unsigned b2 : 2;
  struct bits *b;
  unsigned b4 : 2;
  unsigned b6 : 1;
  unsigned b7 : 2;
};

struct bits *b;

void clear_bits(void) {
  b->b0 = 0;
  b->b1 = 0;
  b->b2 = 0;
  b->b4 = 0;
  b->b6 = 0;
  b->b7 = 0;
}

result

clear_bits:
	.fnstart
@ %bb.0:                                @ %entry
	ldr	r0, .LCPI0_0
.LPC0_0:
	ldr	r0, [pc, r0]
	ldrb	r1, [r0]
	ldrb	r2, [r0, #8]
	and	r1, r1, #240
	strb	r1, [r0]
	and	r1, r2, #252
	strb	r1, [r0, #8]
	ldr	r0, .LCPI0_1
.LPC0_1:
	ldr	r0, [pc, r0]
	ldrb	r1, [r0, #8]
	and	r1, r1, #227
	strb	r1, [r0, #8]
	bx	lr
	.p2align	2

Example 3

struct bits {
  struct bits *b;
  unsigned b0 : 1;
  unsigned b1 : 1;
  unsigned b2 : 2;
  unsigned b4 : 2;
  unsigned b6 : 1;
  unsigned b7 : 2;
};

struct bits *b;

void clear_bits(void) {
  b->b0 = 0;
  b->b1 = 0;
  b->b2 = 0;
  b->b4 = 0;
  b->b6 = 0;
  b->b7 = 0;
}

result

clear_bits:
	.fnstart
@ %bb.0:                                @ %entry
	ldr	r0, .LCPI0_0
.LPC0_0:
	ldr	r0, [pc, r0]
	ldrh	r1, [r0, #4]
	and	r1, r1, #65024
	strh	r1, [r0, #4]
	bx	lr
	.p2align	2

Example 4

struct bits {
  struct bits *b;
  unsigned b0 : 1;
  unsigned b1 : 1;
  unsigned b2 : 2;
  unsigned b4 : 2;
  unsigned b6 : 1;
  unsigned b7 : 2;
};

struct bits *b;

void clear_bits(void) {
  b->b0 = 0;
  b->b1 = 0;
  b->b2 = 0;
  b->b = b;
  b->b4 = 0;
  b->b6 = 0;
  b->b7 = 0;
}

result

clear_bits:
	.fnstart
@ %bb.0:                                @ %entry
	ldr	r0, .LCPI0_0
.LPC0_0:
	ldr	r0, [pc, r0]
	ldrh	r1, [r0, #4]
	str	r0, [r0]
	and	r1, r1, #65024
	strh	r1, [r0, #4]
	bx	lr
	.p2align	2

Thank you for your help!

I believe the example would rather be something like:

void init() {
  b = (struct bits *)&b;
}

If init() is called before clear_bits() then your first store aliases the following load here:

define dso_local arm_aapcscc void @clear_bits() #0 {
entry:
  %0 = load ptr, ptr @b, align 4, !tbaa !5
  %bf.load = load i8, ptr %0, align 4
  %bf.clear = and i8 %bf.load, 127
  %bf.set = or i8 %bf.clear, 0
  store i8 %bf.set, ptr %0, align 4
  %1 = load ptr, ptr @b, align 4, !tbaa !5

But b = (struct bits *)&b; isn’t valid C++, and !tbaa would carry this fact into the IR by conveying that a store to unsigned b0 cannot possibly alias the load of struct bits *.