Recently I’ve came across an example which generates suboptimal code related to bitfields. The example is the following:
struct bits {
unsigned b0 : 1;
unsigned b1 : 1;
unsigned b2 : 2;
unsigned b4 : 2;
unsigned b6 : 1;
unsigned b7 : 1;
};
struct bits *b;
void clear_bits(void) {
b->b0 =
b->b1 =
b->b2 =
b->b4 =
b->b6 =
b->b7 = 0;
}
The IR for this code is this
define dso_local arm_aapcscc void @clear_bits() #0 {
entry:
%0 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load = load i8, ptr %0, align 4
%bf.clear = and i8 %bf.load, 127
%bf.set = or i8 %bf.clear, 0
store i8 %bf.set, ptr %0, align 4
%1 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load1 = load i8, ptr %1, align 4
%bf.clear2 = and i8 %bf.load1, -65
%bf.set3 = or i8 %bf.clear2, 0
store i8 %bf.set3, ptr %1, align 4
%2 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load4 = load i8, ptr %2, align 4
%bf.clear5 = and i8 %bf.load4, -49
%bf.set6 = or i8 %bf.clear5, 0
store i8 %bf.set6, ptr %2, align 4
%3 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load7 = load i8, ptr %3, align 4
%bf.clear8 = and i8 %bf.load7, -13
%bf.set9 = or i8 %bf.clear8, 0
store i8 %bf.set9, ptr %3, align 4
%4 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load10 = load i8, ptr %4, align 4
%bf.clear11 = and i8 %bf.load10, -3
%bf.set12 = or i8 %bf.clear11, 0
store i8 %bf.set12, ptr %4, align 4
%5 = load ptr, ptr @b, align 4, !tbaa !5
%bf.load13 = load i8, ptr %5, align 4
%bf.clear14 = and i8 %bf.load13, -2
%bf.set15 = or i8 %bf.clear14, 0
store i8 %bf.set15, ptr %5, align 4
ret void
}
And the result assembly for ARM is this
clear_bits:
ldr r0, .LCPI0_0
.LPC0_0:
ldr r0, [pc, r0]
ldrb r1, [r0]
and r1, r1, #223
strb r1, [r0]
ldr r0, .LCPI0_1
.LPC0_1:
ldr r0, [pc, r0]
ldrb r1, [r0]
and r1, r1, #239
strb r1, [r0]
ldr r0, .LCPI0_2
.LPC0_2:
ldr r0, [pc, r0]
ldrb r1, [r0]
and r1, r1, #247
strb r1, [r0]
ldr r0, .LCPI0_3
.LPC0_3:
ldr r0, [pc, r0]
ldrb r1, [r0]
and r1, r1, #251
strb r1, [r0]
ldr r0, .LCPI0_4
.LPC0_4:
ldr r0, [pc, r0]
ldrb r1, [r0]
and r1, r1, #253
strb r1, [r0]
ldr r0, .LCPI0_5
.LPC0_5:
ldr r0, [pc, r0]
ldrb r1, [r0]
and r1, r1, #254
strb r1, [r0]
bx lr
.LCPI0_0:
.long b-(.LPC0_0+8)
.LCPI0_1:
.long b-(.LPC0_1+8)
.LCPI0_2:
.long b-(.LPC0_2+8)
.LCPI0_3:
.long b-(.LPC0_3+8)
.LCPI0_4:
.long b-(.LPC0_4+8)
.LCPI0_5:
.long b-(.LPC0_5+8)
b:
.long 0
GCC generates the optimal code for this example, because it realises that every consecutive load/store pair are referencing the same 1 byte memory and therefore it removes them and only the first load and the last store remains, the result assembly looks like this
clear_bits:
adrp x0, .LANCHOR0
ldr x1, [x0, #:lo12:.LANCHOR0]
ldrb w0, [x1]
and w0, w0, -64
strb w0, [x1]
ret
This should be the result in LLVM too. If the struct is allocated on the stack or in BSS, then we get the optimal result for LLVM too
struct bits {
unsigned b0 : 1;
unsigned b1 : 1;
unsigned b2 : 2;
unsigned b4 : 2;
unsigned b6 : 1;
unsigned b7 : 1;
};
struct bits b;
void clear_bits(void) {
b.b0 =
b.b1 =
b.b2 =
b.b4 =
b.b6 =
b.b7 = 0;
}
With this example the result is the following
clear_bits:
ldr r0, .LCPI0_0
.LPC0_0:
add r0, pc, r0
ldrb r1, [r0]
and r1, r1, #192
strb r1, [r0]
bx lr
.LCPI0_0:
.long b-(.LPC0_0+8)
b:
.zero 4
I’m creating this topic to discuss the potential implementation for this optimization, and i have a couple of question, because im not that familiar with the LLVM middle-end. I’ve got a feedback for my first try with this issue, which I’ve implemented in the EarlyCSE pass, that i should use type based alias analysis metadata for the bitfield accesses instead. I checked out the codegen for member expressions and came across this comment in the code in CGExpr.cpp: EmitLValueForField
// TODO: Support TBAA for bit fields.
LValueBaseInfo FieldBaseInfo(BaseInfo.getAlignmentSource());
return LValue::MakeBitfield(Addr, Info, fieldType, FieldBaseInfo,
TBAAAccessInfo());
So first I’ve tried to implement the TBAA for bitfields because it was suggested that if the TBAA is present maybe some optimization will deal with this example using this metadata and i don’t need to implement an different optimization for this, so I’ve tried, and came up with this (which is potentially wrong since im not an expert in this field)
TBAAAccessInfo FieldTBAAInfo = base.getTBAAInfo();
if (!FieldTBAAInfo.BaseType) {
FieldTBAAInfo.BaseType = CGM.getTBAABaseTypeInfo(base.getType());
assert(!FieldTBAAInfo.Offset &&
"Nonzero offset for an access with no base type!");
}
if (FieldTBAAInfo.BaseType)
FieldTBAAInfo.Offset += UseVolatile
? Info.VolatileStorageOffset.getQuantity()
: Info.StorageOffset.getQuantity();
FieldTBAAInfo.AccessType = CGM.getTBAATypeInfo(fieldType);
FieldTBAAInfo.Size =
getContext().getTypeSizeInChars(fieldType).getQuantity();
FieldTBAAInfo.Kind = TBAAAccessKind::MayAlias;
LValueBaseInfo FieldBaseInfo(BaseInfo.getAlignmentSource());
return LValue::MakeBitfield(Addr, Info, fieldType, FieldBaseInfo,
FieldTBAAInfo);
But the example is still wrong after this. So next I’ve dug deeper to find where are optimizations which can be potentially solve my problem with TBAAInfo and why it is not the case after creating TBAAInfo for bitfield accesses. Then I’ve found this in InstCombineLoadStoreAlloca.cpp: visitLoadInst
// Do really simple store-to-load forwarding and load CSE, to catch cases
// where there are several consecutive memory accesses to the same location,
// separated by a few arithmetic operations.
bool IsLoadCSE = false;
if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE))
which starts from a load
instruction and goes back in the basic block and tries to find another load
which loads the same value, collecting instructions along the way which may modify memory. This optimization realises that the consecutive bitfield loads are the same, but because there are stores in between, it uses the alias analysis to decide if the store modifies that given memory location which we load from. This alias analysis gives back MayAlias
so the redundant load is not removed. At this point i should mention that for some reason there is no TBAAInfo for the store but i’ve created the TBAAInfo for the bitfield access, but i don’t understand exactly how these metadata nodes are connected with the instructions in the middle-end level, but even if i had the TBAAInfo for the store the result should be MustAlias
and the redundant load
won’t be removed. So basically I’m stuck with the implementation, and any help could be great.
One side note that the reason why my EarlyCSE implementation is not good is because i quote
This optimization is not legal for the example as written. @b might be stored inside @b, in which case the store to %0 will change the value returned by the next load from @b.
And i’ve tested the implementation with examples where @b
is stored inside @b
and for me the result is good, or im missing something
Some examples with results
Example 1
struct bits {
unsigned b0 : 1;
unsigned b1 : 1;
unsigned b2 : 2;
struct bits *b;
unsigned b4 : 2;
unsigned b6 : 1;
unsigned b7 : 2;
};
struct bits *b;
void clear_bits(void) {
b->b0 = 0;
b->b1 = 0;
b->b2 = 0;
b->b4 = 0;
b->b = b;
b->b6 = 0;
b->b7 = 0;
}
result
clear_bits:
.fnstart
@ %bb.0: @ %entry
ldr r0, .LCPI0_0
.LPC0_0:
ldr r0, [pc, r0]
ldrb r1, [r0]
ldrb r2, [r0, #8]
and r1, r1, #240
strb r1, [r0]
and r1, r2, #252
strb r1, [r0, #8]
ldr r0, .LCPI0_1
.LPC0_1:
ldr r0, [pc, r0]
ldrb r1, [r0, #8]
str r0, [r0, #4]
and r1, r1, #227
strb r1, [r0, #8]
bx lr
Example 2
struct bits {
unsigned b0 : 1;
unsigned b1 : 1;
unsigned b2 : 2;
struct bits *b;
unsigned b4 : 2;
unsigned b6 : 1;
unsigned b7 : 2;
};
struct bits *b;
void clear_bits(void) {
b->b0 = 0;
b->b1 = 0;
b->b2 = 0;
b->b4 = 0;
b->b6 = 0;
b->b7 = 0;
}
result
clear_bits:
.fnstart
@ %bb.0: @ %entry
ldr r0, .LCPI0_0
.LPC0_0:
ldr r0, [pc, r0]
ldrb r1, [r0]
ldrb r2, [r0, #8]
and r1, r1, #240
strb r1, [r0]
and r1, r2, #252
strb r1, [r0, #8]
ldr r0, .LCPI0_1
.LPC0_1:
ldr r0, [pc, r0]
ldrb r1, [r0, #8]
and r1, r1, #227
strb r1, [r0, #8]
bx lr
.p2align 2
Example 3
struct bits {
struct bits *b;
unsigned b0 : 1;
unsigned b1 : 1;
unsigned b2 : 2;
unsigned b4 : 2;
unsigned b6 : 1;
unsigned b7 : 2;
};
struct bits *b;
void clear_bits(void) {
b->b0 = 0;
b->b1 = 0;
b->b2 = 0;
b->b4 = 0;
b->b6 = 0;
b->b7 = 0;
}
result
clear_bits:
.fnstart
@ %bb.0: @ %entry
ldr r0, .LCPI0_0
.LPC0_0:
ldr r0, [pc, r0]
ldrh r1, [r0, #4]
and r1, r1, #65024
strh r1, [r0, #4]
bx lr
.p2align 2
Example 4
struct bits {
struct bits *b;
unsigned b0 : 1;
unsigned b1 : 1;
unsigned b2 : 2;
unsigned b4 : 2;
unsigned b6 : 1;
unsigned b7 : 2;
};
struct bits *b;
void clear_bits(void) {
b->b0 = 0;
b->b1 = 0;
b->b2 = 0;
b->b = b;
b->b4 = 0;
b->b6 = 0;
b->b7 = 0;
}
result
clear_bits:
.fnstart
@ %bb.0: @ %entry
ldr r0, .LCPI0_0
.LPC0_0:
ldr r0, [pc, r0]
ldrh r1, [r0, #4]
str r0, [r0]
and r1, r1, #65024
strh r1, [r0, #4]
bx lr
.p2align 2
Thank you for your help!