llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.85k stars 11.91k forks source link

wasm vector casts from float to a narrow integer type scalarize #74760

Open abadams opened 11 months ago

abadams commented 11 months ago

The following .ll is lowered to 16 scalar conversions with control flow instead of vector code. Adding an intermediate cast to a 32-bit integer behaves as expected (it does not scalarize), but it doesn't seem like that should make a difference. Adding +nontrapping-fptoint fixes it too, but IIUC fptoui produces a poison value on overflow, so trying to lower it to something with overflow checking can't work. There may have been transformations already made to the LLVM IR that are only correct for non-poison in-bounds values, and these transformations may have mapped those out-of-range poison values back in-range in a way that dodges the overflow checks.

; llc wasm_float_cast.ll -mtriple=wasm32-unknown--wasm -mattr=+simd128 -o -

; Minimal reproducer: convert 16 floats read from %in to 16 unsigned bytes
; written to %out. Note that the command line above enables simd128 only;
; nontrapping-fptoint is not enabled.
define void @test(ptr noalias nocapture noundef readonly %in, ptr noalias nocapture noundef writeonly %out) {
entry:
  ; One 16-lane float vector (64 bytes), loaded with 16-byte alignment.
  %fv.0.copyload = load <16 x float>, ptr %in, align 16
  ; Direct float -> i8 conversion; this is the instruction that gets
  ; scalarized. Per the LLVM LangRef, fptoui yields poison for values that
  ; do not fit in the destination type.
  %conv = fptoui <16 x float> %fv.0.copyload to <16 x i8>
  ; The 16 result bytes fill exactly one 128-bit vector.
  store <16 x i8> %conv, ptr %out, align 16
  ret void
}
    # Assembler preamble printed by llc: the source file name, the type
    # signature of `test` (two i32 parameters, no result), and the section,
    # visibility and symbol-type directives for the function that follows.
    .text
    .file   "wasm_float_cast.ll"
    .functype   test (i32, i32) -> ()
    .section    .text.test,"",@
    .globl  test                            # -- Begin function test
    .type   test,@function
# test(in: i32, out: i32) -- llc output for the <16 x float> -> <16 x i8>
# fptoui reproducer. The conversion is fully scalarized: the input is read as
# four v128 groups of four f32 lanes, and every lane is extracted,
# range-checked, truncated with i32.trunc_f32_u and inserted into an i8x16
# result that is stored with a single v128.store at the end.
#
# Locals (0 and 1 are the parameters, 2-6 come from the .local directive):
#   0  i32   input pointer; reused as the converted scalar for lanes 12-15
#   1  i32   output pointer
#   2  v128  current group of four f32 lanes
#   3  f32   lane currently being converted
#   4  i32   converted scalar for lanes 1-11
#   5  i32   converted scalar for lane 0
#   6  v128  i8x16 result being assembled
#
# Per-lane guard pattern (repeated 16 times below):
#   if !(x < 0x1p32 && x >= 0.0)  -> br_if 0 leaves the inner block and the
#                                    result is set to 0
#   else                          -> i32.trunc_f32_u x, then br 1 skips the
#                                    zeroing code
# NaN fails both comparisons and therefore also takes the zero path, so the
# trapping i32.trunc_f32_u never sees NaN or an out-of-range value.
test:                                   # @test
    .functype   test (i32, i32) -> ()
    .local      v128, f32, i32, i32, v128
# %bb.0:                                # %entry
    # ---- Input floats 0-3 (v128.load offset 0) ----
    # Lane 1 is converted first (into local 4), then lane 0 (into local 5).
    block       
    block       
    local.get   0
    v128.load   0
    local.tee   2
    f32x4.extract_lane  1
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label1
# %bb.1:                                # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label0
.LBB0_2:                                # %entry
    end_block                               # label1:
    i32.const   0
    local.set   4
.LBB0_3:                                # %entry
    end_block                               # label0:
    block       
    block       
    local.get   2
    f32x4.extract_lane  0
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label3
# %bb.4:                                # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   5
    br          1                               # 1: down to label2
.LBB0_5:                                # %entry
    end_block                               # label3:
    i32.const   0
    local.set   5
.LBB0_6:                                # %entry
    end_block                               # label2:
    # Start the result vector: splat the lane-0 value to all 16 byte lanes,
    # then overwrite byte lane 1 with the lane-1 value.
    local.get   5
    i8x16.splat
    local.get   4
    i8x16.replace_lane  1
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  2
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label5
# %bb.7:                                # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label4
.LBB0_8:                                # %entry
    end_block                               # label5:
    i32.const   0
    local.set   4
.LBB0_9:                                # %entry
    end_block                               # label4:
    local.get   6
    local.get   4
    i8x16.replace_lane  2
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  3
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label7
# %bb.10:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label6
.LBB0_11:                               # %entry
    end_block                               # label7:
    i32.const   0
    local.set   4
.LBB0_12:                               # %entry
    end_block                               # label6:
    local.get   6
    local.get   4
    i8x16.replace_lane  3
    local.set   6
    # ---- Input floats 4-7 (v128.load offset 16) -> result bytes 4-7 ----
    block       
    block       
    local.get   0
    v128.load   16
    local.tee   2
    f32x4.extract_lane  0
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label9
# %bb.13:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label8
.LBB0_14:                               # %entry
    end_block                               # label9:
    i32.const   0
    local.set   4
.LBB0_15:                               # %entry
    end_block                               # label8:
    local.get   6
    local.get   4
    i8x16.replace_lane  4
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  1
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label11
# %bb.16:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label10
.LBB0_17:                               # %entry
    end_block                               # label11:
    i32.const   0
    local.set   4
.LBB0_18:                               # %entry
    end_block                               # label10:
    local.get   6
    local.get   4
    i8x16.replace_lane  5
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  2
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label13
# %bb.19:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label12
.LBB0_20:                               # %entry
    end_block                               # label13:
    i32.const   0
    local.set   4
.LBB0_21:                               # %entry
    end_block                               # label12:
    local.get   6
    local.get   4
    i8x16.replace_lane  6
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  3
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label15
# %bb.22:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label14
.LBB0_23:                               # %entry
    end_block                               # label15:
    i32.const   0
    local.set   4
.LBB0_24:                               # %entry
    end_block                               # label14:
    local.get   6
    local.get   4
    i8x16.replace_lane  7
    local.set   6
    # ---- Input floats 8-11 (v128.load offset 32) -> result bytes 8-11 ----
    block       
    block       
    local.get   0
    v128.load   32
    local.tee   2
    f32x4.extract_lane  0
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label17
# %bb.25:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label16
.LBB0_26:                               # %entry
    end_block                               # label17:
    i32.const   0
    local.set   4
.LBB0_27:                               # %entry
    end_block                               # label16:
    local.get   6
    local.get   4
    i8x16.replace_lane  8
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  1
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label19
# %bb.28:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label18
.LBB0_29:                               # %entry
    end_block                               # label19:
    i32.const   0
    local.set   4
.LBB0_30:                               # %entry
    end_block                               # label18:
    local.get   6
    local.get   4
    i8x16.replace_lane  9
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  2
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label21
# %bb.31:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label20
.LBB0_32:                               # %entry
    end_block                               # label21:
    i32.const   0
    local.set   4
.LBB0_33:                               # %entry
    end_block                               # label20:
    local.get   6
    local.get   4
    i8x16.replace_lane  10
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  3
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label23
# %bb.34:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   4
    br          1                               # 1: down to label22
.LBB0_35:                               # %entry
    end_block                               # label23:
    i32.const   0
    local.set   4
.LBB0_36:                               # %entry
    end_block                               # label22:
    local.get   6
    local.get   4
    i8x16.replace_lane  11
    local.set   6
    # ---- Input floats 12-15 (pointer + 48, v128.load offset 0) ----
    # This is the last read through the input pointer; from here on local 0
    # is overwritten and holds the converted scalar for each lane.
    block       
    block       
    local.get   0
    i32.const   48
    i32.add 
    v128.load   0
    local.tee   2
    f32x4.extract_lane  0
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label25
# %bb.37:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   0
    br          1                               # 1: down to label24
.LBB0_38:                               # %entry
    end_block                               # label25:
    i32.const   0
    local.set   0
.LBB0_39:                               # %entry
    end_block                               # label24:
    local.get   6
    local.get   0
    i8x16.replace_lane  12
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  1
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label27
# %bb.40:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   0
    br          1                               # 1: down to label26
.LBB0_41:                               # %entry
    end_block                               # label27:
    i32.const   0
    local.set   0
.LBB0_42:                               # %entry
    end_block                               # label26:
    local.get   6
    local.get   0
    i8x16.replace_lane  13
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  2
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label29
# %bb.43:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   0
    br          1                               # 1: down to label28
.LBB0_44:                               # %entry
    end_block                               # label29:
    i32.const   0
    local.set   0
.LBB0_45:                               # %entry
    end_block                               # label28:
    local.get   6
    local.get   0
    i8x16.replace_lane  14
    local.set   6
    block       
    block       
    local.get   2
    f32x4.extract_lane  3
    local.tee   3
    f32.const   0x1p32
    f32.lt  
    local.get   3
    f32.const   0x0p0
    f32.ge  
    i32.and 
    i32.eqz
    br_if       0                               # 0: down to label31
# %bb.46:                               # %entry
    local.get   3
    i32.trunc_f32_u
    local.set   0
    br          1                               # 1: down to label30
.LBB0_47:                               # %entry
    end_block                               # label31:
    i32.const   0
    local.set   0
.LBB0_48:                               # %entry
    end_block                               # label30:
    # Insert the final byte (lane 15) and store the completed i8x16 vector
    # through the output pointer in local 1.
    local.get   1
    local.get   6
    local.get   0
    i8x16.replace_lane  15
    v128.store  0
                                        # fallthrough-return
    end_function
                                        # -- End function
    # target_features custom section. The payload below is an entry count
    # followed by one (prefix byte, name length, name) record per feature.
    # Byte 43 is ASCII '+'; per the WebAssembly tool-conventions linking
    # document, that prefix marks a feature as used by this object.
    # Note that nontrapping-fptoint is not among the listed features.
    .section    .custom_section.target_features,"",@
    # Number of feature records that follow.
    .int8   3
    # '+', length 15, "mutable-globals"
    .int8   43
    .int8   15
    .ascii  "mutable-globals"
    # '+', length 8, "sign-ext"
    .int8   43
    .int8   8
    .ascii  "sign-ext"
    # '+', length 7, "simd128"
    .int8   43
    .int8   7
    .ascii  "simd128"
    # Switch back to the function's text section.
    .section    .text.test,"",@
llvmbot commented 11 months ago

@llvm/issue-subscribers-backend-webassembly

Author: Andrew Adams (abadams)

The following .ll produces to 16 scalar ops with control flow. Adding an intermediate cast to a 32-bit integer behaves as expected (does not scalarize), but that doesn't seem like it should be helpful. Adding +nontrapping-fptoint fixes it too, but IIUC fptoui is a poison value on overflow, so trying to lower it to something with overflow checking can't work. There may have been transformations already made to the llvm IR that are only correct for non-poison in-bounds values, and these transformations may have mapped those out-of-range poison values back in-range in a way that dodges the overflow checks. ``` ; llc wasm_float_cast.ll -mtriple=wasm32-unknown--wasm -mattr=+simd128 -o - define void @test(ptr noalias nocapture noundef readonly %in, ptr noalias nocapture noundef writeonly %out) { entry: %fv.0.copyload = load <16 x float>, ptr %in, align 16 %conv = fptoui <16 x float> %fv.0.copyload to <16 x i8> store <16 x i8> %conv, ptr %out, align 16 ret void } ``` ``` .text .file "wasm_float_cast.ll" .functype test (i32, i32) -> () .section .text.test,"",@ .globl test # -- Begin function test .type test,@function test: # @test .functype test (i32, i32) -> () .local v128, f32, i32, i32, v128 # %bb.0: # %entry block block local.get 0 v128.load 0 local.tee 2 f32x4.extract_lane 1 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label1 # %bb.1: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label0 .LBB0_2: # %entry end_block # label1: i32.const 0 local.set 4 .LBB0_3: # %entry end_block # label0: block block local.get 2 f32x4.extract_lane 0 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label3 # %bb.4: # %entry local.get 3 i32.trunc_f32_u local.set 5 br 1 # 1: down to label2 .LBB0_5: # %entry end_block # label3: i32.const 0 local.set 5 .LBB0_6: # %entry end_block # label2: local.get 5 i8x16.splat local.get 4 i8x16.replace_lane 1 
local.set 6 block block local.get 2 f32x4.extract_lane 2 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label5 # %bb.7: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label4 .LBB0_8: # %entry end_block # label5: i32.const 0 local.set 4 .LBB0_9: # %entry end_block # label4: local.get 6 local.get 4 i8x16.replace_lane 2 local.set 6 block block local.get 2 f32x4.extract_lane 3 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label7 # %bb.10: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label6 .LBB0_11: # %entry end_block # label7: i32.const 0 local.set 4 .LBB0_12: # %entry end_block # label6: local.get 6 local.get 4 i8x16.replace_lane 3 local.set 6 block block local.get 0 v128.load 16 local.tee 2 f32x4.extract_lane 0 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label9 # %bb.13: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label8 .LBB0_14: # %entry end_block # label9: i32.const 0 local.set 4 .LBB0_15: # %entry end_block # label8: local.get 6 local.get 4 i8x16.replace_lane 4 local.set 6 block block local.get 2 f32x4.extract_lane 1 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label11 # %bb.16: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label10 .LBB0_17: # %entry end_block # label11: i32.const 0 local.set 4 .LBB0_18: # %entry end_block # label10: local.get 6 local.get 4 i8x16.replace_lane 5 local.set 6 block block local.get 2 f32x4.extract_lane 2 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label13 # %bb.19: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label12 .LBB0_20: # %entry end_block # label13: i32.const 0 local.set 4 .LBB0_21: # %entry end_block # label12: 
local.get 6 local.get 4 i8x16.replace_lane 6 local.set 6 block block local.get 2 f32x4.extract_lane 3 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label15 # %bb.22: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label14 .LBB0_23: # %entry end_block # label15: i32.const 0 local.set 4 .LBB0_24: # %entry end_block # label14: local.get 6 local.get 4 i8x16.replace_lane 7 local.set 6 block block local.get 0 v128.load 32 local.tee 2 f32x4.extract_lane 0 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label17 # %bb.25: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label16 .LBB0_26: # %entry end_block # label17: i32.const 0 local.set 4 .LBB0_27: # %entry end_block # label16: local.get 6 local.get 4 i8x16.replace_lane 8 local.set 6 block block local.get 2 f32x4.extract_lane 1 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label19 # %bb.28: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label18 .LBB0_29: # %entry end_block # label19: i32.const 0 local.set 4 .LBB0_30: # %entry end_block # label18: local.get 6 local.get 4 i8x16.replace_lane 9 local.set 6 block block local.get 2 f32x4.extract_lane 2 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label21 # %bb.31: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label20 .LBB0_32: # %entry end_block # label21: i32.const 0 local.set 4 .LBB0_33: # %entry end_block # label20: local.get 6 local.get 4 i8x16.replace_lane 10 local.set 6 block block local.get 2 f32x4.extract_lane 3 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label23 # %bb.34: # %entry local.get 3 i32.trunc_f32_u local.set 4 br 1 # 1: down to label22 .LBB0_35: # %entry end_block # label23: 
i32.const 0 local.set 4 .LBB0_36: # %entry end_block # label22: local.get 6 local.get 4 i8x16.replace_lane 11 local.set 6 block block local.get 0 i32.const 48 i32.add v128.load 0 local.tee 2 f32x4.extract_lane 0 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label25 # %bb.37: # %entry local.get 3 i32.trunc_f32_u local.set 0 br 1 # 1: down to label24 .LBB0_38: # %entry end_block # label25: i32.const 0 local.set 0 .LBB0_39: # %entry end_block # label24: local.get 6 local.get 0 i8x16.replace_lane 12 local.set 6 block block local.get 2 f32x4.extract_lane 1 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label27 # %bb.40: # %entry local.get 3 i32.trunc_f32_u local.set 0 br 1 # 1: down to label26 .LBB0_41: # %entry end_block # label27: i32.const 0 local.set 0 .LBB0_42: # %entry end_block # label26: local.get 6 local.get 0 i8x16.replace_lane 13 local.set 6 block block local.get 2 f32x4.extract_lane 2 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label29 # %bb.43: # %entry local.get 3 i32.trunc_f32_u local.set 0 br 1 # 1: down to label28 .LBB0_44: # %entry end_block # label29: i32.const 0 local.set 0 .LBB0_45: # %entry end_block # label28: local.get 6 local.get 0 i8x16.replace_lane 14 local.set 6 block block local.get 2 f32x4.extract_lane 3 local.tee 3 f32.const 0x1p32 f32.lt local.get 3 f32.const 0x0p0 f32.ge i32.and i32.eqz br_if 0 # 0: down to label31 # %bb.46: # %entry local.get 3 i32.trunc_f32_u local.set 0 br 1 # 1: down to label30 .LBB0_47: # %entry end_block # label31: i32.const 0 local.set 0 .LBB0_48: # %entry end_block # label30: local.get 1 local.get 6 local.get 0 i8x16.replace_lane 15 v128.store 0 # fallthrough-return end_function # -- End function .section .custom_section.target_features,"",@ .int8 3 .int8 43 .int8 15 .ascii "mutable-globals" .int8 43 .int8 8 .ascii "sign-ext" 
.int8 43 .int8 7 .ascii "simd128" .section .text.test,"",@ ```