Clang produces individual memclr calls when setting large data structures

PiJoules commented 1 year ago

When compiling the following

#include <stdint.h>
#include <string.h>

/* Enum definitions */
typedef enum __global_state_SessionType {
    _global_state_SessionType_SESSION_TYPE_IDLE = 0,
    _global_state_SessionType_SESSION_TYPE_CALL = 1,
    _global_state_SessionType_SESSION_TYPE_MEDIA = 2
} _global_state_SessionType;

typedef enum __global_state_DisconnectType {
    _global_state_DisconnectType_DISCONNECT_TYPE_UNKNOWN = 0,
    _global_state_DisconnectType_DISCONNECT_TYPE_INTENTIONAL_BY_BUD = 1,
    _global_state_DisconnectType_DISCONNECT_TYPE_UNINTENTIONAL = 2,
    _global_state_DisconnectType_DISCONNECT_TYPE_INTENTIONAL_BY_USER = 3,
    _global_state_DisconnectType_DISCONNECT_TYPE_INTENTIONAL_CONNECTION_SWITCHING = 4
} _global_state_DisconnectType;

typedef enum __global_state_ActionTarget_RegularActionTarget {
    _global_state_ActionTarget_RegularActionTarget_REGULAR_ACTION_TARGET_UNKNOWN = 0,
    _global_state_ActionTarget_RegularActionTarget_CHECK_NOTIFICATIONS = 1,
    _global_state_ActionTarget_RegularActionTarget_PREVIOUS_TRACK_REPEAT = 2,
    _global_state_ActionTarget_RegularActionTarget_NEXT_TRACK = 3,
    _global_state_ActionTarget_RegularActionTarget_PLAY_PAUSE_TRACK = 4,
    _global_state_ActionTarget_RegularActionTarget_ANC_CONTROL = 5,
    _global_state_ActionTarget_RegularActionTarget_ASSISTANT_QUERY = 6
} _global_state_ActionTarget_RegularActionTarget;

/* Struct definitions */
typedef struct __global_state_ActionTarget {
    _global_state_ActionTarget_RegularActionTarget regular;
} _global_state_ActionTarget;

typedef struct __global_state_AudioSession {
    uint32_t id;
    uint8_t host_id;
    _global_state_SessionType type;
    uint32_t imprecise_duration_ms;
} _global_state_AudioSession;

typedef struct __global_state_BackstageConfig {
    bool has_headtracking_3dof_mode;
    bool headtracking_3dof_mode;
} _global_state_BackstageConfig;

typedef struct __global_state_ConversationModeState {
    uint32_t version;
    bool conversation_active;
} _global_state_ConversationModeState;

typedef struct __global_state_DeviceInformation {
    uint8_t host_id;
    uint8_t bt_address[6];
    bool is_sass_enabled;
    uint8_t order;
    _global_state_DisconnectType disconnect_type;
    bool is_bt_device_id_received;
    bool is_bt_device_vendor_id_sig;
    uint16_t bt_device_vendor_id;
    uint16_t bt_device_product_id;
    uint8_t fast_pair_platform;
    uint8_t fast_pair_firmware_version;
} _global_state_DeviceInformation;

typedef struct __global_state_DynamicBufferConfig {
    uint32_t version;
    uint32_t current_buffer_level_ms;
    uint32_t target_buffer_level_ms;
    uint32_t active_codec_id;
    bool low_latency_mode;
    bool low_latency_mode_preferred;
} _global_state_DynamicBufferConfig;

typedef struct __global_state_HostSession {
    uint32_t id;
    uint8_t host_id;
    uint32_t imprecise_duration_ms;
    uint32_t overall_session_id;
} _global_state_HostSession;

typedef struct __global_state_MediaManagerSettings {
    uint32_t version;
    bool ohd_music_paused_by_doff;
    bool media_paused_by_conversation;
} _global_state_MediaManagerSettings;

typedef struct __global_state_OutOfRangeEarconState {
    bool skip_connection_disconnection_earcon;
    uint8_t disconnected_host_id;
} _global_state_OutOfRangeEarconState;

typedef struct __global_state_OverallSession {
    uint32_t id;
    uint32_t imprecise_duration_ms;
} _global_state_OverallSession;

typedef struct __global_state_RandomDeviceId {
    uint32_t version;
    uint64_t random_device_id;
} _global_state_RandomDeviceId;

typedef struct __global_state_SpatialAudioState {
    uint32_t version;
    bool role_switch_lock;
    uint32_t connection_handle;
    uint32_t feature1;
    bool head_tracking_active;
    bool interrupt_channel_is_open;
} _global_state_SpatialAudioState;

typedef struct __global_state_SpotState {
    uint32_t version;
    bool is_ringing_event_spot;
    uint32_t ringing_state;
    uint32_t ringing_volume;
    uint32_t ringing_timeout;
    uint32_t left_spot_clock;
    uint32_t right_spot_clock;
} _global_state_SpotState;

typedef struct __global_state_ConnectivityStates {
    uint32_t version;
    bool _connected;
    bool debug_app_connected;
    uint32_t disconnect_reason;
    bool has_audio_session;
    _global_state_AudioSession audio_session;
    uint32_t role_swith_type;
    uint8_t active_host_id;
    bool has_host_session_0;
    _global_state_HostSession host_session_0;
    bool has_host_session_1;
    _global_state_HostSession host_session_1;
    bool has_overall_session;
    _global_state_OverallSession overall_session;
    uint32_t force_singlepoint_ux;
    bool has_out_of_range_earcon_state;
    _global_state_OutOfRangeEarconState out_of_range_earcon_state;
} _global_state_ConnectivityStates;

typedef struct __global_state_GestureMapping {
    bool has_hold;
    _global_state_ActionTarget hold;
} _global_state_GestureMapping;

typedef struct __global_state_PairedDeviceList {
    uint32_t version;
    size_t device_info_count;
    _global_state_DeviceInformation device_info[8];
} _global_state_PairedDeviceList;

typedef struct __global_state_GsmTouchControl {
    bool has_left;
    _global_state_GestureMapping left;
    bool has_right;
    _global_state_GestureMapping right;
} _global_state_GsmTouchControl;

typedef struct __AncToggleSetting {
    bool has_anc;
    bool anc;
    bool has_off;
    bool off;
    bool has_transparency;
    bool transparency;
} _AncToggleSetting;

typedef struct __UserEq {
    bool has_low_bass;
    float low_bass;
    bool has_bass;
    float bass;
    bool has_mid;
    float mid;
    bool has_treble;
    float treble;
    bool has_upper_treble;
    float upper_treble;
} _UserEq;

typedef struct __global_state_Settings {
    uint32_t version;
    bool auto_ota_enable;
    bool ohd_enable;
    bool oobe_is_finished;
    bool gesture_enable;
    uint32_t color_sku;
    bool diagnostics_enable;
    uint32_t version_counter;
    bool oobe_mode;
    uint8_t fmd_state;
    uint32_t fmd_host;
    bool volume_eq_enable;
    bool multipoint_enable;
    bool has_gesture_control;
    _global_state_GsmTouchControl gesture_control;
    int8_t ancr_state_one_bud;
    int8_t ancr_state_two_buds;
    bool otts_mode;
    bool has_ancr_gesture_loop;
    _AncToggleSetting ancr_gesture_loop;
    bool has_current_user_eq;
    _UserEq current_user_eq;
    int32_t volume_asymmetry;
    bool sum_to_mono_enable;
    bool has_last_saved_user_eq;
    _UserEq last_saved_user_eq;
    bool dosimeter_notifications_enable;
    bool has_backstage_config;
    _global_state_BackstageConfig backstage_config;
    bool conversation_mode_enable;
    int32_t dynamic_low_latency_state;
} _global_state_Settings;

typedef struct __global_state_State {
    bool has__settings;
    _global_state_Settings _settings;
    bool has_connectivity_states;
    _global_state_ConnectivityStates connectivity_states;
    bool has_media_manager_settings;
    _global_state_MediaManagerSettings media_manager_settings;
    bool has_random_device_id;
    _global_state_RandomDeviceId random_device_id;
    bool has_paired_device_list;
    _global_state_PairedDeviceList paired_device_list;
    bool has_dynamic_buffer_config;
    _global_state_DynamicBufferConfig dynamic_buffer_config;
    bool has_spatial_audio_state;
    _global_state_SpatialAudioState spatial_audio_state;
    bool has_conversation_mode_state;
    _global_state_ConversationModeState conversation_mode_state;
    bool has_spot_state;
    _global_state_SpotState spot_state;
} _global_state_State;

/* Initializer values for message structs */
#define _global_state_ActionTarget_init_default {_global_state_ActionTarget_RegularActionTarget_REGULAR_ACTION_TARGET_UNKNOWN}
#define _global_state_GestureMapping_init_default {false, _global_state_ActionTarget_init_default}
#define _global_state_GsmTouchControl_init_default {false, _global_state_GestureMapping_init_default, false, _global_state_GestureMapping_init_default}
#define _global_state_Settings_init_default {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, false, _global_state_GsmTouchControl_init_default, 0, 0, 0, false, {false, true, false, false, false, true}, false, {false, 0, false, 0, false, 0, false, 0, false, 0}, 0, 0, false, {false, 0, false, 0, false, 0, false, 0, false, 0}, 0, false, _global_state_BackstageConfig_init_default, 0, 0}
#define _global_state_BackstageConfig_init_default {false, 0}
#define _global_state_MediaManagerSettings_init_default {0, 0, 0}
#define _global_state_AudioSession_init_default {0, 0, _global_state_SessionType_SESSION_TYPE_IDLE, 0}
#define _global_state_HostSession_init_default {0, 0, 0, 0}
#define _global_state_OverallSession_init_default {0, 0}
#define _global_state_OutOfRangeEarconState_init_default {0, 0}
#define _global_state_ConnectivityStates_init_default {0, 0, 0, 0, false, _global_state_AudioSession_init_default, 0, 0, false, _global_state_HostSession_init_default, false, _global_state_HostSession_init_default, false, _global_state_OverallSession_init_default, 0, false, _global_state_OutOfRangeEarconState_init_default}
#define _global_state_RandomDeviceId_init_default {0, 0}
#define _global_state_DeviceInformation_init_default {0, {0}, 0, 0, _global_state_DisconnectType_DISCONNECT_TYPE_UNKNOWN, 0, 0, 0, 0, 0, 0}
#define _global_state_PairedDeviceList_init_default {0, 0, {_global_state_DeviceInformation_init_default, _global_state_DeviceInformation_init_default, _global_state_DeviceInformation_init_default, _global_state_DeviceInformation_init_default, _global_state_DeviceInformation_init_default, _global_state_DeviceInformation_init_default, _global_state_DeviceInformation_init_default, _global_state_DeviceInformation_init_default}}
#define _global_state_DynamicBufferConfig_init_default {0, 0, 0, 0, 0, 0}
#define _global_state_SpatialAudioState_init_default {0, 0, 0, 0, 0, 0}
#define _global_state_ConversationModeState_init_default {0, 0}
#define _global_state_SpotState_init_default {0, 0, 0, 0, 0, 0, 0}
#define _global_state_State_init_default   {false, _global_state_Settings_init_default, false, _global_state_ConnectivityStates_init_default, false, _global_state_MediaManagerSettings_init_default, false, _global_state_RandomDeviceId_init_default, false, _global_state_PairedDeviceList_init_default, false, _global_state_DynamicBufferConfig_init_default, false, _global_state_SpatialAudioState_init_default, false, _global_state_ConversationModeState_init_default, false, _global_state_SpotState_init_default}

class GlobalStateManager {
 public:
  void ResetChangedState();
  _global_state_State changed_state_old_value_;
  _global_state_State changed_state_new_value_;
};

void GlobalStateManager::ResetChangedState() {
  changed_state_old_value_ = _global_state_State_init_default;
  changed_state_new_value_ = _global_state_State_init_default;
}

with

clang++ -mcpu=cortex-m4 -mabi=aapcs -mthumb -mfloat-abi=soft -Qunused-arguments -nodefaultlibs --target=arm-none-eabi --sysroot=../.environment/cipd/packages/arm/arm-none-eabi -stdlib++-isystem../.environment/cipd/packages/arm/arm-none-eabi/include/c++/10.3.1 -Oz -ffreestanding -fno-pic -fomit-frame-pointer -fshort-enums -g -fno-common -fno-exceptions -ffunction-sections -fdata-sections ../bud/bt_core/global_state_manager/global_state_manager.cc -S -o -

Clang produces a GlobalStateManager::ResetChangedState that's over 800 bytes in size. The function is composed of inidividual calls to __aeabi_memclr in the form:

        .loc    0 2090 28 is_stmt 1
        addw    r0, r4, #1185
        .loc    0 2090 30 is_stmt 0
        movs    r1, #36
        .loc    0 2089 28 is_stmt 1
        strb.w  r7, [r4, #1152]
        strb.w  r7, [r4, #1140]
        strb.w  r7, [r4, #1116]
        strb.w  r7, [r4, #1092]
        strh.w  r7, [r4, #1090]
        .loc    0 2090 30
        bl      __aeabi_memclr

for every member in the structs that are zero. GCC however produces something significantly smaller:

        push    {r4, r5, r6, lr}
        mov     r4, r0
        mov     r6, #528
        movs    r5, #1
        mov     r2, r6
        movs    r1, #0
        add     r0, r0, #656
.LVL218:
        bl      memset
.LVL219:
        strb    r5, [r4, #695]
        strb    r5, [r4, #699]
        mov     r2, r6
        movs    r1, #0
        add     r0, r4, #1184
        bl      memset
.LVL220:
        movs    r3, #0
        strb    r5, [r4, #1223]
        strb    r5, [r4, #1227]
        strb    r3, [r4, #652]
        pop     {r4, r5, r6, pc}

Since very few members get initialized to non-zero values, GCC instead memsets both members then sets individual members to non-zero values. At -Oz, it would be nice if Clang was able to produce something like this since just replacing the Clang definition with the GCC one saves ~750 bytes of text just for this one function.

llvmbot commented 1 year ago

@llvm/issue-subscribers-clang-codegen

PiJoules commented 1 year ago

Updating with Richard Smith’s response:

Seems likely that the IR emitted by Clang doesn’t let LLVM know that it’s allowed to overwrite the padding. If you make a constexpr variable to hold the initializer and assign from that instead of using the macro, clang generates a memcpy instead.

Can confirm using a constexpr variable allows clang to clear the whole struct then set individual non-zero values to members.

efriedma-quic commented 1 year ago

It's probably still worth tracking this as a general optimization, even if you've adopted a workaround for your specific usage.

PiJoules commented 6 months ago

Ok did some digging and I think I have a little more idea of what's going on. Here I'm just looking at one member:

void GlobalStateManager::ResetChangedState() {
  changed_state_old_value_ = _global_state_State_init_default;
}

So when using an initializer via the macro, the IR clang emits is an alloca for the large struct, a memset(0) for that struct, then a long sequence of GEPs and individual stores for that alloca, then a final memcpy into the this ptr:

define dso_local void @_ZN18GlobalStateManager17ResetChangedStateEv(ptr noundef nonnull align 8 dereferenceable(1056) %this) #0 align 2 {
entry:
  %this.addr = alloca ptr, align 4
  %ref.tmp = alloca %struct.__global_state_State, align 8
  store ptr %this, ptr %this.addr, align 4
  %this1 = load ptr, ptr %this.addr, align 4
  call void @llvm.memset.p0.i64(ptr align 8 %ref.tmp, i8 0, i64 528, i1 false)
  %has__settings = getelementptr inbounds %struct.__global_state_State, ptr %ref.tmp, i32 0, i32 0
  store i8 0, ptr %has__settings, align 8
  %_settings = getelementptr inbounds %struct.__global_state_State, ptr %ref.tmp, i32 0, i32 1
  %has_gesture_control = getelementptr inbounds %struct.__global_state_Settings, ptr %_settings, i32 0, i32 13
  store i8 0, ptr %has_gesture_control, align 2
  %gesture_control = getelementptr inbounds %struct.__global_state_Settings, ptr %_settings, i32 0, i32 14
... lots of GEPs + stores for the individual members
  store i8 0, ptr %has_spot_state, align 8
  %changed_state_old_value_ = getelementptr inbounds %class.GlobalStateManager, ptr %this1, i32 0, i32 0
  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %changed_state_old_value_, ptr align 8 %ref.tmp, i32 528, i1 false)
  ret void
}

With the default opt pipeline, SROA will transform it into something like this:

define dso_local void @_ZN18GlobalStateManager17ResetChangedStateEv(ptr noundef nonnull writeonly align 8 dereferenceable(1056) %this) local_unnamed_addr #0 align 2 {
entry:
  %ref.tmp.sroa.5 = alloca [33 x i8], align 1
  %ref.tmp.sroa.12 = alloca { i8, i8, i8 }, align 4
  %ref.tmp.sroa.21 = alloca [7 x i8], align 1
  %ref.tmp.sroa.22 = alloca [7 x i8], align 1
  %ref.tmp.sroa.23 = alloca [7 x i8], align 1
  %ref.tmp.sroa.24 = alloca [7 x i8], align 1
  %ref.tmp.sroa.25 = alloca [12 x i8], align 1
  %ref.tmp.sroa.27 = alloca [7 x i8], align 1
  %ref.tmp.sroa.28 = alloca [7 x i8], align 1
  %ref.tmp.sroa.29 = alloca [7 x i8], align 1
  %ref.tmp.sroa.30 = alloca [7 x i8], align 1
  %ref.tmp.sroa.33 = alloca [9 x i8], align 1
  %ref.tmp.sroa.34 = alloca [15 x i8], align 1
  %ref.tmp.sroa.36 = alloca [11 x i8], align 2
  %ref.tmp.sroa.37 = alloca [18 x i8], align 2
  %ref.tmp.sroa.38 = alloca [19 x i8], align 1
  %ref.tmp.sroa.39 = alloca [15 x i8], align 1
  %ref.tmp.sroa.40 = alloca [3 x i8], align 1
  %ref.tmp.sroa.41 = alloca [11 x i8], align 1
  %ref.tmp.sroa.42 = alloca [19 x i8], align 1
  %ref.tmp.sroa.43 = alloca [155 x i8], align 1
  %ref.tmp.sroa.45 = alloca [23 x i8], align 1
  %ref.tmp.sroa.46 = alloca [23 x i8], align 1
  %ref.tmp.sroa.47 = alloca [11 x i8], align 1
  %ref.tmp.sroa.48 = alloca [31 x i8], align 1
... each alloca has its own GEPs + stores

which eventually gets transformed to

define dso_local void @_ZN18GlobalStateManager17ResetChangedStateEv(ptr nocapture noundef nonnull writeonly align 8 dereferenceable(1056) %this) local_unnamed_addr #0 align 2 {
entry:
  %ref.tmp.sroa.43 = alloca [155 x i8], align 1
  call void @llvm.lifetime.start.p0(i64 155, ptr nonnull %ref.tmp.sroa.43)
  call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(11) %ref.tmp.sroa.43, i8 0, i64 11, i1 false)
  %ref.tmp.sroa.43.11.device_info.sroa_idx = getelementptr inbounds i8, ptr %ref.tmp.sroa.43, i32 11
  call void @llvm.memset.p0.i32(ptr noundef nonnull align 1 dereferenceable(144) %ref.tmp.sroa.43.11.device_info.sroa_idx, i8 0, i32 144, i1 false)
  store i8 0, ptr %this, align 8, !tbaa !3
  %ref.tmp.sroa.5.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 1
  %ref.tmp.sroa.14.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 46
  tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(45) %ref.tmp.sroa.5.0.changed_state_old_value_.sroa_idx, i8 0, i64 45, i1 false)
  store i8 1, ptr %ref.tmp.sroa.14.0.changed_state_old_value_.sroa_idx, align 2, !tbaa !3
  %ref.tmp.sroa.15.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 47
  store i8 0, ptr %ref.tmp.sroa.15.0.changed_state_old_value_.sroa_idx, align 1, !tbaa !3
  %ref.tmp.sroa.16.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 48
  store i8 0, ptr %ref.tmp.sroa.16.0.changed_state_old_value_.sroa_idx, align 8, !tbaa !3
  %ref.tmp.sroa.17.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 49
  store i8 0, ptr %ref.tmp.sroa.17.0.changed_state_old_value_.sroa_idx, align 1, !tbaa !3
  %ref.tmp.sroa.18.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 50
  store i8 1, ptr %ref.tmp.sroa.18.0.changed_state_old_value_.sroa_idx, align 2, !tbaa !3
  %ref.tmp.sroa.19.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 51
  store i8 0, ptr %ref.tmp.sroa.19.0.changed_state_old_value_.sroa_idx, align 1, !tbaa !3
  %ref.tmp.sroa.20.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 52
  store i8 0, ptr %ref.tmp.sroa.20.0.changed_state_old_value_.sroa_idx, align 4, !tbaa !3
  %ref.tmp.sroa.21.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 53
  %ref.tmp.sroa.43.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 281
  tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(228) %ref.tmp.sroa.21.0.changed_state_old_value_.sroa_idx, i8 0, i64 228, i1 false)
  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(155) %ref.tmp.sroa.43.0.changed_state_old_value_.sroa_idx, ptr noundef nonnull align 1 dereferenceable(155) %ref.tmp.sroa.43, i32 155, i1 false), !tbaa.struct !7
  %ref.tmp.sroa.44.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 436
  store i8 0, ptr %ref.tmp.sroa.44.0.changed_state_old_value_.sroa_idx, align 4, !tbaa !3
  %ref.tmp.sroa.45.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 437
  tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(91) %ref.tmp.sroa.45.0.changed_state_old_value_.sroa_idx, i8 0, i64 91, i1 false)
  call void @llvm.lifetime.end.p0(i64 155, ptr nonnull %ref.tmp.sroa.43)
  ret void
}

Then this will get lowered to what we see in the original bug description:

_ZN18GlobalStateManager17ResetChangedStateEv:
        .fnstart
@ %bb.0:                                @ %entry
        .save   {r4, r5, r6, lr}
        push    {r4, r5, r6, lr}
        .pad    #160
        sub     sp, #160
        movs    r6, #0
        mov     r4, r0
        movs    r1, #144
        str.w   r6, [sp, #11]
        strd    r6, r6, [sp, #4]
        add     r5, sp, #4
        add.w   r0, r5, #11
        bl      __aeabi_memclr
        adds    r0, r4, #1
        movs    r1, #45
        strb    r6, [r4]
        bl      __aeabi_memclr
        movs    r0, #1
        movs    r1, #228
        strb.w  r6, [r4, #52]
        strh    r0, [r4, #50]
        str.w   r0, [r4, #46]
        add.w   r0, r4, #53
        bl      __aeabi_memclr
        addw    r0, r4, #281
        mov     r1, r5
        movs    r2, #155
        bl      __aeabi_memcpy
        addw    r0, r4, #437
        movs    r1, #91
        strb.w  r6, [r4, #436]
        bl      __aeabi_memclr
        add     sp, #160
        pop     {r4, r5, r6, pc}

It looks like SROA is just doing its job of splitting the larger alloca into a bunch of smaller disjoint (potentially non-contiguous) allocas that get their own store/memset instructions. Then this just gets lowered normally to stores and memclr calls. Because of the split, we can't do one large memclr over the initial alloca like in the original IR (probably) because these allocas might might not be contiguous or may be laid out in any order. It looks like SROA determines the number of slices and sizes by the use of each pointer, so because we have a lot of uses via GEPs and stores, we end up with a lot of slices.

This is contrary to the IR using constexpr which is just

define dso_local void @_ZN18GlobalStateManager18ResetChangedState2Ev(ptr noundef nonnull align 8 dereferenceable(1056) %this) #0 align 2 {
entry:
  %this.addr = alloca ptr, align 4
  %unique_name = alloca %struct.__global_state_State, align 8
  store ptr %this, ptr %this.addr, align 4
  %this1 = load ptr, ptr %this.addr, align 4
  call void @llvm.memset.p0.i32(ptr align 8 %unique_name, i8 0, i32 528, i1 false)
  %0 = getelementptr inbounds %struct.__global_state_State, ptr %unique_name, i32 0, i32 1
  %1 = getelementptr inbounds %struct.__global_state_Settings, ptr %0, i32 0, i32 19
  %2 = getelementptr inbounds %struct.__AncToggleSetting, ptr %1, i32 0, i32 1
  store i8 1, ptr %2, align 1
  %3 = getelementptr inbounds %struct.__AncToggleSetting, ptr %1, i32 0, i32 5
  store i8 1, ptr %3, align 1
  %changed_state_old_value_ = getelementptr inbounds %class.GlobalStateManager, ptr %this1, i32 0, i32 0
  call void @llvm.memcpy.p0.p0.i32(ptr align 8 %changed_state_old_value_, ptr align 8 %unique_name, i32 528, i1 false)
  ret void
}

Here, clang emits the same alloca followed by the same memset(0) but only 2 GEPs and 2 stores for the only non-zero members of the whole struct. I think because it has much fewer uses, SROA splits it into fewer slices

define dso_local void @_ZN18GlobalStateManager18ResetChangedState2Ev(ptr nocapture noundef nonnull writeonly align 8 dereferenceable(1056) %this) local_unnamed_addr #4 align 2 {
entry:
  %unique_name.sroa.0 = alloca [46 x i8], align 8
  %unique_name.sroa.5 = alloca { i8, i8, i8 }, align 4
  %unique_name.sroa.6 = alloca [477 x i8], align 1
  call void @llvm.lifetime.start.p0(i64 46, ptr %unique_name.sroa.0)
  call void @llvm.lifetime.start.p0(i64 3, ptr %unique_name.sroa.5)
  call void @llvm.lifetime.start.p0(i64 477, ptr %unique_name.sroa.6)
  call void @llvm.memset.p0.i32(ptr align 8 %unique_name.sroa.0, i8 0, i32 46, i1 false)
  call void @llvm.memset.p0.i32(ptr align 4 %unique_name.sroa.5, i8 0, i32 3, i1 false)
  call void @llvm.memset.p0.i32(ptr align 1 %unique_name.sroa.6, i8 0, i32 477, i1 false)
  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 8 dereferenceable(46) %this, ptr noundef nonnull align 8 dereferenceable(46) %unique_name.sroa.0, i32 46, i1 false), !tbaa.struct !11
  %unique_name.sroa.4.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 46
  store i8 1, ptr %unique_name.sroa.4.0.changed_state_old_value_.sroa_idx, align 2, !tbaa !3
  %unique_name.sroa.5.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 47
  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %unique_name.sroa.5.0.changed_state_old_value_.sroa_idx, ptr noundef nonnull align 4 dereferenceable(3) %unique_name.sroa.5, i32 3, i1 false), !tbaa.struct !20
  %unique_name.sroa.52.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 50
  store i8 1, ptr %unique_name.sroa.52.0.changed_state_old_value_.sroa_idx, align 2, !tbaa !3
  %unique_name.sroa.6.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 51
  call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(477) %unique_name.sroa.6.0.changed_state_old_value_.sroa_idx, ptr noundef nonnull align 1 dereferenceable(477) %unique_name.sroa.6, i32 477, i1 false), !tbaa.struct !21
  call void @llvm.lifetime.end.p0(i64 46, ptr %unique_name.sroa.0)
  call void @llvm.lifetime.end.p0(i64 3, ptr %unique_name.sroa.5)
  call void @llvm.lifetime.end.p0(i64 477, ptr %unique_name.sroa.6)
  ret void
}

which eventually gets transformed to

define dso_local void @_ZN18GlobalStateManager18ResetChangedState2Ev(ptr nocapture noundef nonnull writeonly align 8 dereferenceable(1056) %this) local_unnamed_addr #4 align 2 {
entry:
  tail call void @llvm.memset.p0.i32(ptr noundef nonnull align 8 dereferenceable(46) %this, i8 0, i32 46, i1 false)
  %unique_name.sroa.4.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 46
  store i8 1, ptr %unique_name.sroa.4.0.changed_state_old_value_.sroa_idx, align 2, !tbaa !3
  %unique_name.sroa.5.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 47
  tail call void @llvm.memset.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %unique_name.sroa.5.0.changed_state_old_value_.sroa_idx, i8 0, i32 3, i1 false)
  %unique_name.sroa.52.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 50
  store i8 1, ptr %unique_name.sroa.52.0.changed_state_old_value_.sroa_idx, align 2, !tbaa !3
  %unique_name.sroa.6.0.changed_state_old_value_.sroa_idx = getelementptr inbounds i8, ptr %this, i32 51
  tail call void @llvm.memset.p0.i32(ptr noundef nonnull align 1 dereferenceable(477) %unique_name.sroa.6.0.changed_state_old_value_.sroa_idx, i8 0, i32 477, i1 false)
  ret void
}

Not an SROA expert, but it looks to me that the pass isn't doing anything wrong here since it should be perfectly legal to split the alloca up. I see these potential ways to address this:

Because we always do a memset(0) right after the alloca and before any of the GEPs + stores, it should be possible to remove redundant store 0s which removes extra uses of the original alloca. Then the initializer-list example should look a lot closer to the constexpr example with just 2 GEPs and 2 stores. This could be done before SROA is run on the function, or as a separate pass that always gets run before SROA (if one doesn't exist already). Having something in LLVM means other frontends also get access to this pass.
Because both of these are essentially constant (struct) values, I think it should be possible to just create llvm::Constants on the frontend then do a single store into the this ptr rather than do an alloca then copy the alloca. This particular struct (I think) satisfies the first-class type requirement for store values.

efriedma-quic commented 6 months ago

Not sure where the memset(0) over the whole struct is coming from... I'd have to look a bit more. But I'm not sure if clang generates a memset like that in all relevant cases.

There's a pass ordering issue here; if you run memcpyopt, the extra alloca that's still hanging around after -O2. Maybe this would be better if SROA had some handling for memset.

Probably clang could be a bit smarter in a few ways. We have some code to try to emit a copy from a global constant in AggExprEmitter::EmitArrayInit, but I don't think we do something equivalent for structs. And we could maybe try to optimize structs that contain a lot of zeros to avoid storing the zeros one-by-one. And maybe if we wanted to be really clever, we could avoid the temporary alloca altogether, but it's a little tricky given C++ semantics require the temporary in the general case.

llvm / llvm-project

Clang produces individual memclr calls when setting large data structures #62813