llvm / llvm-project

[SLP] Vectorizing jumbled memory loads causes big regressions on AArch64 #35021

Open eastig opened 6 years ago

eastig commented 6 years ago
Bugzilla Link 35673
Version trunk
OS All
CC @efriedma-quic, @fhahn, @hfinkel, @RKSimon

Extended Description

The patch https://reviews.llvm.org/D36130 caused 27.7% and 30.2% regressions on an AArch64 Juno board (http://lnt.llvm.org/db_default/v4/nts/83681):

MultiSource/Benchmarks/mediabench/gsm/toast/toast: 30.20%
MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm: 27.73%

The sources of the benchmarks:
http://www.llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/mediabench/gsm/toast/
http://www.llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/MiBench/telecomm-gsm/

Analysis of mediabench/gsm/toast shows that there is an issue with lowering the IR created by the SLP Vectorizer into efficient code. The file of interest is long_term.c (http://www.llvm.org/viewvc/llvm-project/test-suite/trunk/MultiSource/Benchmarks/mediabench/gsm/toast/long_term.c?view=markup). The function of interest is Gsm_Long_Term_Predictor.
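
For context, here is a simplified C sketch of that loop, reconstructed from the IR shown below and the usual GSM 06.10 source shape. The helper name ltp_lag_search is illustrative, the real source spells out the 40 multiply-accumulates explicitly rather than using an inner loop, and the word/longword typedefs are assumed to map to short/long as the discussion in this report indicates:

/* Simplified sketch of the long-term-prediction lag search in
 * Gsm_Long_Term_Predictor.  "word"/"longword" are assumed to be the
 * benchmark's typedefs for short/long, so the accumulator is 64-bit
 * on LP64 AArch64 and 32-bit on 32-bit ARM. */
typedef short word;
typedef long  longword;

static void ltp_lag_search(const word wt[40], const word *dp,
                           word *Nc_out, longword *L_max_out)
{
    longword L_max = 0;
    word     Nc    = 40;

    for (int lambda = 40; lambda <= 120; lambda++) {
        longword L_result = 0;
        for (int k = 0; k < 40; k++)      /* 40 multiply-accumulates per lag */
            L_result += (longword)wt[k] * dp[k - lambda];
        if (L_result > L_max) {           /* keep the best lag */
            L_max = L_result;
            Nc    = (word)lambda;
        }
    }
    *Nc_out    = Nc;
    *L_max_out = L_max;
}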

The vectorised version of the main loop is lowered to 232 instructions, whereas the non-vectorised version is lowered to 100 instructions.

The following IR

for.body49.i: ; preds = %for.body49.i, %if.end22.thread.i %indvars.iv.i = phi i64 [ 40, %if.end22.thread.i ], [ %indvars.iv.next.i, %for.body49.i ] %lambda.0671.i = phi i32 [ 40, %if.end22.thread.i ], [ %inc413.i, %for.body49.i ] %Nc.0670.i = phi i16 [ 40, %if.end22.thread.i ], [ %spec.select660.i, %for.body49.i ] %L_max.0669.i = phi i64 [ 0, %if.end22.thread.i ], [ %spec.select659.i, %for.body49.i ] %42 = sub nsw i64 0, %indvars.iv.i %arrayidx54.i = getelementptr inbounds i16, i16 %dp, i64 %42 %43 = load i16, i16 %arrayidx54.i, align 2, !tbaa !​2 %conv55.i = sext i16 %43 to i64 %mul.i = mul nsw i64 %conv51.i, %conv55.i %44 = sub nsw i64 1, %indvars.iv.i %arrayidx61.i = getelementptr inbounds i16, i16 %dp, i64 %44 %45 = load i16, i16 %arrayidx61.i, align 2, !tbaa !​2 %conv62.i = sext i16 %45 to i64 %mul63.i = mul nsw i64 %conv58.i, %conv62.i %add.i = add nsw i64 %mul63.i, %mul.i %46 = sub nsw i64 2, %indvars.iv.i %arrayidx69.i = getelementptr inbounds i16, i16 %dp, i64 %46 %47 = load i16, i16 %arrayidx69.i, align 2, !tbaa !​2 %conv70.i = sext i16 %47 to i64 %mul71.i = mul nsw i64 %conv66.i, %conv70.i %add73.i = add nsw i64 %add.i, %mul71.i %48 = sub nsw i64 3, %indvars.iv.i %arrayidx78.i = getelementptr inbounds i16, i16 %dp, i64 %48 %49 = load i16, i16 %arrayidx78.i, align 2, !tbaa !​2 %conv79.i = sext i16 %49 to i64 %mul80.i = mul nsw i64 %conv75.i, %conv79.i %add82.i = add nsw i64 %add73.i, %mul80.i %50 = sub nsw i64 4, %indvars.iv.i %arrayidx87.i = getelementptr inbounds i16, i16 %dp, i64 %50 %51 = load i16, i16 %arrayidx87.i, align 2, !tbaa !​2 %conv88.i = sext i16 %51 to i64 %mul89.i = mul nsw i64 %conv84.i, %conv88.i %add91.i = add nsw i64 %add82.i, %mul89.i %52 = sub nsw i64 5, %indvars.iv.i %arrayidx96.i = getelementptr inbounds i16, i16 %dp, i64 %52 %53 = load i16, i16 %arrayidx96.i, align 2, !tbaa !​2 %conv97.i = sext i16 %53 to i64 %mul98.i = mul nsw i64 %conv93.i, %conv97.i %add100.i = add nsw i64 %add91.i, %mul98.i %54 = sub nsw i64 6, %indvars.iv.i %arrayidx105.i = getelementptr inbounds i16, i16 %dp, i64 %54 %55 = load i16, i16 %arrayidx105.i, align 2, !tbaa !​2 %conv106.i = sext i16 %55 to i64 %mul107.i = mul nsw i64 %conv102.i, %conv106.i %add109.i = add nsw i64 %add100.i, %mul107.i %56 = sub nsw i64 7, %indvars.iv.i %arrayidx114.i = getelementptr inbounds i16, i16 %dp, i64 %56 %57 = load i16, i16 %arrayidx114.i, align 2, !tbaa !​2 %conv115.i = sext i16 %57 to i64 %mul116.i = mul nsw i64 %conv111.i, %conv115.i %add118.i = add nsw i64 %add109.i, %mul116.i %58 = sub nsw i64 8, %indvars.iv.i %arrayidx123.i = getelementptr inbounds i16, i16 %dp, i64 %58 %59 = load i16, i16 %arrayidx123.i, align 2, !tbaa !​2 %conv124.i = sext i16 %59 to i64 %mul125.i = mul nsw i64 %conv120.i, %conv124.i %add127.i = add nsw i64 %add118.i, %mul125.i %60 = sub nsw i64 9, %indvars.iv.i %arrayidx132.i = getelementptr inbounds i16, i16 %dp, i64 %60 %61 = load i16, i16 %arrayidx132.i, align 2, !tbaa !​2 %conv133.i = sext i16 %61 to i64 %mul134.i = mul nsw i64 %conv129.i, %conv133.i %add136.i = add nsw i64 %add127.i, %mul134.i %62 = sub nsw i64 10, %indvars.iv.i %arrayidx141.i = getelementptr inbounds i16, i16 %dp, i64 %62 %63 = load i16, i16 %arrayidx141.i, align 2, !tbaa !​2 %conv142.i = sext i16 %63 to i64 %mul143.i = mul nsw i64 %conv138.i, %conv142.i %add145.i = add nsw i64 %add136.i, %mul143.i %64 = sub nsw i64 11, %indvars.iv.i %arrayidx150.i = getelementptr inbounds i16, i16 %dp, i64 %64 %65 = load i16, i16 %arrayidx150.i, align 2, !tbaa !​2 %conv151.i = sext i16 %65 to i64 %mul152.i = mul nsw 
i64 %conv147.i, %conv151.i %add154.i = add nsw i64 %add145.i, %mul152.i %66 = sub nsw i64 12, %indvars.iv.i %arrayidx159.i = getelementptr inbounds i16, i16 %dp, i64 %66 %67 = load i16, i16 %arrayidx159.i, align 2, !tbaa !​2 %conv160.i = sext i16 %67 to i64 %mul161.i = mul nsw i64 %conv156.i, %conv160.i %add163.i = add nsw i64 %add154.i, %mul161.i %68 = sub nsw i64 13, %indvars.iv.i %arrayidx168.i = getelementptr inbounds i16, i16 %dp, i64 %68 %69 = load i16, i16 %arrayidx168.i, align 2, !tbaa !​2 %conv169.i = sext i16 %69 to i64 %mul170.i = mul nsw i64 %conv165.i, %conv169.i %add172.i = add nsw i64 %add163.i, %mul170.i %70 = sub nsw i64 14, %indvars.iv.i %arrayidx177.i = getelementptr inbounds i16, i16 %dp, i64 %70 %71 = load i16, i16 %arrayidx177.i, align 2, !tbaa !​2 %conv178.i = sext i16 %71 to i64 %mul179.i = mul nsw i64 %conv174.i, %conv178.i %add181.i = add nsw i64 %add172.i, %mul179.i %72 = sub nsw i64 15, %indvars.iv.i %arrayidx186.i = getelementptr inbounds i16, i16 %dp, i64 %72 %73 = load i16, i16 %arrayidx186.i, align 2, !tbaa !​2 %conv187.i = sext i16 %73 to i64 %mul188.i = mul nsw i64 %conv183.i, %conv187.i %add190.i = add nsw i64 %add181.i, %mul188.i %74 = sub nsw i64 16, %indvars.iv.i %arrayidx195.i = getelementptr inbounds i16, i16 %dp, i64 %74 %75 = load i16, i16 %arrayidx195.i, align 2, !tbaa !​2 %conv196.i = sext i16 %75 to i64 %mul197.i = mul nsw i64 %conv192.i, %conv196.i %add199.i = add nsw i64 %add190.i, %mul197.i %76 = sub nsw i64 17, %indvars.iv.i %arrayidx204.i = getelementptr inbounds i16, i16 %dp, i64 %76 %77 = load i16, i16 %arrayidx204.i, align 2, !tbaa !​2 %conv205.i = sext i16 %77 to i64 %mul206.i = mul nsw i64 %conv201.i, %conv205.i %add208.i = add nsw i64 %add199.i, %mul206.i %78 = sub nsw i64 18, %indvars.iv.i %arrayidx213.i = getelementptr inbounds i16, i16 %dp, i64 %78 %79 = load i16, i16 %arrayidx213.i, align 2, !tbaa !​2 %conv214.i = sext i16 %79 to i64 %mul215.i = mul nsw i64 %conv210.i, %conv214.i %add217.i = add nsw i64 %add208.i, %mul215.i %80 = sub nsw i64 19, %indvars.iv.i %arrayidx222.i = getelementptr inbounds i16, i16 %dp, i64 %80 %81 = load i16, i16 %arrayidx222.i, align 2, !tbaa !​2 %conv223.i = sext i16 %81 to i64 %mul224.i = mul nsw i64 %conv219.i, %conv223.i %add226.i = add nsw i64 %add217.i, %mul224.i %82 = sub nsw i64 20, %indvars.iv.i %arrayidx231.i = getelementptr inbounds i16, i16 %dp, i64 %82 %83 = load i16, i16 %arrayidx231.i, align 2, !tbaa !​2 %conv232.i = sext i16 %83 to i64 %mul233.i = mul nsw i64 %conv228.i, %conv232.i %add235.i = add nsw i64 %add226.i, %mul233.i %84 = sub nsw i64 21, %indvars.iv.i %arrayidx240.i = getelementptr inbounds i16, i16 %dp, i64 %84 %85 = load i16, i16 %arrayidx240.i, align 2, !tbaa !​2 %conv241.i = sext i16 %85 to i64 %mul242.i = mul nsw i64 %conv237.i, %conv241.i %add244.i = add nsw i64 %add235.i, %mul242.i %86 = sub nsw i64 22, %indvars.iv.i %arrayidx249.i = getelementptr inbounds i16, i16 %dp, i64 %86 %87 = load i16, i16 %arrayidx249.i, align 2, !tbaa !​2 %conv250.i = sext i16 %87 to i64 %mul251.i = mul nsw i64 %conv246.i, %conv250.i %add253.i = add nsw i64 %add244.i, %mul251.i %88 = sub nsw i64 23, %indvars.iv.i %arrayidx258.i = getelementptr inbounds i16, i16 %dp, i64 %88 %89 = load i16, i16 %arrayidx258.i, align 2, !tbaa !​2 %conv259.i = sext i16 %89 to i64 %mul260.i = mul nsw i64 %conv255.i, %conv259.i %add262.i = add nsw i64 %add253.i, %mul260.i %90 = sub nsw i64 24, %indvars.iv.i %arrayidx267.i = getelementptr inbounds i16, i16 %dp, i64 %90 %91 = load i16, i16 %arrayidx267.i, align 2, 
!tbaa !​2 %conv268.i = sext i16 %91 to i64 %mul269.i = mul nsw i64 %conv264.i, %conv268.i %add271.i = add nsw i64 %add262.i, %mul269.i %92 = sub nsw i64 25, %indvars.iv.i %arrayidx276.i = getelementptr inbounds i16, i16 %dp, i64 %92 %93 = load i16, i16 %arrayidx276.i, align 2, !tbaa !​2 %conv277.i = sext i16 %93 to i64 %mul278.i = mul nsw i64 %conv273.i, %conv277.i %add280.i = add nsw i64 %add271.i, %mul278.i %94 = sub nsw i64 26, %indvars.iv.i %arrayidx285.i = getelementptr inbounds i16, i16 %dp, i64 %94 %95 = load i16, i16 %arrayidx285.i, align 2, !tbaa !​2 %conv286.i = sext i16 %95 to i64 %mul287.i = mul nsw i64 %conv282.i, %conv286.i %add289.i = add nsw i64 %add280.i, %mul287.i %96 = sub nsw i64 27, %indvars.iv.i %arrayidx294.i = getelementptr inbounds i16, i16 %dp, i64 %96 %97 = load i16, i16 %arrayidx294.i, align 2, !tbaa !​2 %conv295.i = sext i16 %97 to i64 %mul296.i = mul nsw i64 %conv291.i, %conv295.i %add298.i = add nsw i64 %add289.i, %mul296.i %98 = sub nsw i64 28, %indvars.iv.i %arrayidx303.i = getelementptr inbounds i16, i16 %dp, i64 %98 %99 = load i16, i16 %arrayidx303.i, align 2, !tbaa !​2 %conv304.i = sext i16 %99 to i64 %mul305.i = mul nsw i64 %conv300.i, %conv304.i %add307.i = add nsw i64 %add298.i, %mul305.i %100 = sub nsw i64 29, %indvars.iv.i %arrayidx312.i = getelementptr inbounds i16, i16 %dp, i64 %100 %101 = load i16, i16 %arrayidx312.i, align 2, !tbaa !​2 %conv313.i = sext i16 %101 to i64 %mul314.i = mul nsw i64 %conv309.i, %conv313.i %add316.i = add nsw i64 %add307.i, %mul314.i %102 = sub nsw i64 30, %indvars.iv.i %arrayidx321.i = getelementptr inbounds i16, i16 %dp, i64 %102 %103 = load i16, i16 %arrayidx321.i, align 2, !tbaa !​2 %conv322.i = sext i16 %103 to i64 %mul323.i = mul nsw i64 %conv318.i, %conv322.i %add325.i = add nsw i64 %add316.i, %mul323.i %104 = sub nsw i64 31, %indvars.iv.i %arrayidx330.i = getelementptr inbounds i16, i16 %dp, i64 %104 %105 = load i16, i16 %arrayidx330.i, align 2, !tbaa !​2 %conv331.i = sext i16 %105 to i64 %mul332.i = mul nsw i64 %conv327.i, %conv331.i %add334.i = add nsw i64 %add325.i, %mul332.i %106 = sub nsw i64 32, %indvars.iv.i %arrayidx339.i = getelementptr inbounds i16, i16 %dp, i64 %106 %107 = load i16, i16 %arrayidx339.i, align 2, !tbaa !​2 %conv340.i = sext i16 %107 to i64 %mul341.i = mul nsw i64 %conv336.i, %conv340.i %add343.i = add nsw i64 %add334.i, %mul341.i %108 = sub nsw i64 33, %indvars.iv.i %arrayidx348.i = getelementptr inbounds i16, i16 %dp, i64 %108 %109 = load i16, i16 %arrayidx348.i, align 2, !tbaa !​2 %conv349.i = sext i16 %109 to i64 %mul350.i = mul nsw i64 %conv345.i, %conv349.i %add352.i = add nsw i64 %add343.i, %mul350.i %110 = sub nsw i64 34, %indvars.iv.i %arrayidx357.i = getelementptr inbounds i16, i16 %dp, i64 %110 %111 = load i16, i16 %arrayidx357.i, align 2, !tbaa !​2 %conv358.i = sext i16 %111 to i64 %mul359.i = mul nsw i64 %conv354.i, %conv358.i %add361.i = add nsw i64 %add352.i, %mul359.i %112 = sub nsw i64 35, %indvars.iv.i %arrayidx366.i = getelementptr inbounds i16, i16 %dp, i64 %112 %113 = load i16, i16 %arrayidx366.i, align 2, !tbaa !​2 %conv367.i = sext i16 %113 to i64 %mul368.i = mul nsw i64 %conv363.i, %conv367.i %add370.i = add nsw i64 %add361.i, %mul368.i %114 = sub nsw i64 36, %indvars.iv.i %arrayidx375.i = getelementptr inbounds i16, i16 %dp, i64 %114 %115 = load i16, i16 %arrayidx375.i, align 2, !tbaa !​2 %conv376.i = sext i16 %115 to i64 %mul377.i = mul nsw i64 %conv372.i, %conv376.i %add379.i = add nsw i64 %add370.i, %mul377.i %116 = sub nsw i64 37, %indvars.iv.i %arrayidx384.i 
= getelementptr inbounds i16, i16 %dp, i64 %116 %117 = load i16, i16 %arrayidx384.i, align 2, !tbaa !​2 %conv385.i = sext i16 %117 to i64 %mul386.i = mul nsw i64 %conv381.i, %conv385.i %add388.i = add nsw i64 %add379.i, %mul386.i %118 = sub nsw i64 38, %indvars.iv.i %arrayidx393.i = getelementptr inbounds i16, i16 %dp, i64 %118 %119 = load i16, i16 %arrayidx393.i, align 2, !tbaa !​2 %conv394.i = sext i16 %119 to i64 %mul395.i = mul nsw i64 %conv390.i, %conv394.i %add397.i = add nsw i64 %add388.i, %mul395.i %120 = sub nsw i64 39, %indvars.iv.i %arrayidx402.i = getelementptr inbounds i16, i16 %dp, i64 %120 %121 = load i16, i16 %arrayidx402.i, align 2, !tbaa !​2 %conv403.i = sext i16 %121 to i64 %mul404.i = mul nsw i64 %conv399.i, %conv403.i %add406.i = add nsw i64 %add397.i, %mul404.i %cmp407.i = icmp sgt i64 %add406.i, %L_max.0669.i %conv410.i = trunc i32 %lambda.0671.i to i16 %spec.select659.i = select i1 %cmp407.i, i64 %add406.i, i64 %L_max.0669.i %spec.select660.i = select i1 %cmp407.i, i16 %conv410.i, i16 %Nc.0670.i %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1 %inc413.i = add nuw nsw i32 %lambda.0671.i, 1 %exitcond.i = icmp eq i64 %indvars.iv.next.i, 121 br i1 %exitcond.i, label %for.end414.i, label %for.body49.i

the SLP vectorizer transforms into

for.body49.i: ; preds = %for.body49.i, %if.end22.thread.i %indvars.iv.i = phi i64 [ 40, %if.end22.thread.i ], [ %indvars.iv.next.i, %for.body49.i ] %lambda.0671.i = phi i32 [ 40, %if.end22.thread.i ], [ %inc413.i, %for.body49.i ] %Nc.0670.i = phi i16 [ 40, %if.end22.thread.i ], [ %spec.select660.i, %for.body49.i ] %L_max.0669.i = phi i64 [ 0, %if.end22.thread.i ], [ %spec.select659.i, %for.body49.i ] %57 = sub nsw i64 0, %indvars.iv.i %arrayidx54.i = getelementptr inbounds i16, i16 %dp, i64 %57 %58 = sub nsw i64 1, %indvars.iv.i %arrayidx61.i = getelementptr inbounds i16, i16 %dp, i64 %58 %59 = sub nsw i64 2, %indvars.iv.i %arrayidx69.i = getelementptr inbounds i16, i16 %dp, i64 %59 %60 = sub nsw i64 3, %indvars.iv.i %arrayidx78.i = getelementptr inbounds i16, i16 %dp, i64 %60 %61 = sub nsw i64 4, %indvars.iv.i %arrayidx87.i = getelementptr inbounds i16, i16 %dp, i64 %61 %62 = sub nsw i64 5, %indvars.iv.i %arrayidx96.i = getelementptr inbounds i16, i16 %dp, i64 %62 %63 = sub nsw i64 6, %indvars.iv.i %arrayidx105.i = getelementptr inbounds i16, i16 %dp, i64 %63 %64 = sub nsw i64 7, %indvars.iv.i %arrayidx114.i = getelementptr inbounds i16, i16 %dp, i64 %64 %65 = sub nsw i64 8, %indvars.iv.i %arrayidx123.i = getelementptr inbounds i16, i16 %dp, i64 %65 %66 = sub nsw i64 9, %indvars.iv.i %arrayidx132.i = getelementptr inbounds i16, i16 %dp, i64 %66 %67 = sub nsw i64 10, %indvars.iv.i %arrayidx141.i = getelementptr inbounds i16, i16 %dp, i64 %67 %68 = sub nsw i64 11, %indvars.iv.i %arrayidx150.i = getelementptr inbounds i16, i16 %dp, i64 %68 %69 = sub nsw i64 12, %indvars.iv.i %arrayidx159.i = getelementptr inbounds i16, i16 %dp, i64 %69 %70 = sub nsw i64 13, %indvars.iv.i %arrayidx168.i = getelementptr inbounds i16, i16 %dp, i64 %70 %71 = sub nsw i64 14, %indvars.iv.i %arrayidx177.i = getelementptr inbounds i16, i16 %dp, i64 %71 %72 = sub nsw i64 15, %indvars.iv.i %arrayidx186.i = getelementptr inbounds i16, i16 %dp, i64 %72 %73 = sub nsw i64 16, %indvars.iv.i %arrayidx195.i = getelementptr inbounds i16, i16 %dp, i64 %73 %74 = sub nsw i64 17, %indvars.iv.i %arrayidx204.i = getelementptr inbounds i16, i16 %dp, i64 %74 %75 = sub nsw i64 18, %indvars.iv.i %arrayidx213.i = getelementptr inbounds i16, i16 %dp, i64 %75 %76 = sub nsw i64 19, %indvars.iv.i %arrayidx222.i = getelementptr inbounds i16, i16 %dp, i64 %76 %77 = sub nsw i64 20, %indvars.iv.i %arrayidx231.i = getelementptr inbounds i16, i16 %dp, i64 %77 %78 = sub nsw i64 21, %indvars.iv.i %arrayidx240.i = getelementptr inbounds i16, i16 %dp, i64 %78 %79 = sub nsw i64 22, %indvars.iv.i %arrayidx249.i = getelementptr inbounds i16, i16 %dp, i64 %79 %80 = sub nsw i64 23, %indvars.iv.i %arrayidx258.i = getelementptr inbounds i16, i16 %dp, i64 %80 %81 = sub nsw i64 24, %indvars.iv.i %arrayidx267.i = getelementptr inbounds i16, i16 %dp, i64 %81 %82 = sub nsw i64 25, %indvars.iv.i %arrayidx276.i = getelementptr inbounds i16, i16 %dp, i64 %82 %83 = sub nsw i64 26, %indvars.iv.i %arrayidx285.i = getelementptr inbounds i16, i16 %dp, i64 %83 %84 = sub nsw i64 27, %indvars.iv.i %arrayidx294.i = getelementptr inbounds i16, i16 %dp, i64 %84 %85 = sub nsw i64 28, %indvars.iv.i %arrayidx303.i = getelementptr inbounds i16, i16 %dp, i64 %85 %86 = sub nsw i64 29, %indvars.iv.i %arrayidx312.i = getelementptr inbounds i16, i16 %dp, i64 %86 %87 = sub nsw i64 30, %indvars.iv.i %arrayidx321.i = getelementptr inbounds i16, i16 %dp, i64 %87 %88 = sub nsw i64 31, %indvars.iv.i %arrayidx330.i = getelementptr inbounds i16, i16 %dp, i64 %88 %89 = bitcast i16 
%arrayidx54.i to <32 x i16> %90 = load <32 x i16>, <32 x i16> %89, align 2, !tbaa !​2 %91 = shufflevector <32 x i16> %90, <32 x i16> undef, <32 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> %92 = sext <32 x i16> %91 to <32 x i64> %93 = mul nsw <32 x i64> %54, %92 %add.i = add nsw i64 undef, undef %add73.i = add nsw i64 %add.i, undef %add82.i = add nsw i64 %add73.i, undef %add91.i = add nsw i64 %add82.i, undef %add100.i = add nsw i64 %add91.i, undef %add109.i = add nsw i64 %add100.i, undef %add118.i = add nsw i64 %add109.i, undef %add127.i = add nsw i64 %add118.i, undef %add136.i = add nsw i64 %add127.i, undef %add145.i = add nsw i64 %add136.i, undef %add154.i = add nsw i64 %add145.i, undef %add163.i = add nsw i64 %add154.i, undef %add172.i = add nsw i64 %add163.i, undef %add181.i = add nsw i64 %add172.i, undef %add190.i = add nsw i64 %add181.i, undef %add199.i = add nsw i64 %add190.i, undef %add208.i = add nsw i64 %add199.i, undef %add217.i = add nsw i64 %add208.i, undef %add226.i = add nsw i64 %add217.i, undef %add235.i = add nsw i64 %add226.i, undef %add244.i = add nsw i64 %add235.i, undef %add253.i = add nsw i64 %add244.i, undef %add262.i = add nsw i64 %add253.i, undef %add271.i = add nsw i64 %add262.i, undef %add280.i = add nsw i64 %add271.i, undef %add289.i = add nsw i64 %add280.i, undef %add298.i = add nsw i64 %add289.i, undef %add307.i = add nsw i64 %add298.i, undef %add316.i = add nsw i64 %add307.i, undef %add325.i = add nsw i64 %add316.i, undef %add334.i = add nsw i64 %add325.i, undef %94 = sub nsw i64 32, %indvars.iv.i %arrayidx339.i = getelementptr inbounds i16, i16 %dp, i64 %94 %95 = sub nsw i64 33, %indvars.iv.i %arrayidx348.i = getelementptr inbounds i16, i16 %dp, i64 %95 %96 = sub nsw i64 34, %indvars.iv.i %arrayidx357.i = getelementptr inbounds i16, i16 %dp, i64 %96 %97 = sub nsw i64 35, %indvars.iv.i %arrayidx366.i = getelementptr inbounds i16, i16 %dp, i64 %97 %98 = sub nsw i64 36, %indvars.iv.i %arrayidx375.i = getelementptr inbounds i16, i16 %dp, i64 %98 %99 = sub nsw i64 37, %indvars.iv.i %arrayidx384.i = getelementptr inbounds i16, i16 %dp, i64 %99 %100 = sub nsw i64 38, %indvars.iv.i %arrayidx393.i = getelementptr inbounds i16, i16 %dp, i64 %100 %101 = sub nsw i64 39, %indvars.iv.i %arrayidx402.i = getelementptr inbounds i16, i16 %dp, i64 %101 %102 = bitcast i16 %arrayidx339.i to <8 x i16> %103 = load <8 x i16>, <8 x i16> %102, align 2, !tbaa !​2 %104 = sext <8 x i16> %103 to <8 x i64> %105 = mul nsw <8 x i64> %56, %104 %add343.i = add nsw i64 %add334.i, undef %add352.i = add nsw i64 %add343.i, undef %add361.i = add nsw i64 %add352.i, undef %add370.i = add nsw i64 %add361.i, undef %add379.i = add nsw i64 %add370.i, undef %add388.i = add nsw i64 %add379.i, undef %add397.i = add nsw i64 %add388.i, undef %106 = call i64 @​llvm.experimental.vector.reduce.add.i64.v32i64(<32 x i64> %93) %107 = call i64 @​llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> %105) %op.rdx = add nsw i64 %106, %107 %add406.i = add nsw i64 %add397.i, undef %cmp407.i = icmp sgt i64 %op.rdx, %L_max.0669.i %conv410.i = trunc i32 %lambda.0671.i to i16 %spec.select659.i = select i1 %cmp407.i, i64 %op.rdx, i64 %L_max.0669.i %spec.select660.i = select i1 %cmp407.i, i16 %conv410.i, i16 %Nc.0670.i %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1 %inc413.i = add nuw nsw i32 %lambda.0671.i, 1 
%exitcond.i = icmp eq i64 %indvars.iv.next.i, 121 br i1 %exitcond.i, label %for.end414.i, label %for.body49.i

which is lowered into

ldp q29, q31, [x3,#-64]
sxtl2 v8.4s, v31.8h
sxtl v13.4s, v31.4h
sxtl2 v31.2d, v13.4s
ldp q30, q9, [x3,#-32]
sxtl2 v12.2d, v8.4s
sxtl v10.4s, v9.4h
sxtl2 v9.4s, v9.8h
mov x8, v12.d[1]
fmov x10, d12
sxtl2 v12.2d, v9.4s
sxtl2 v11.4s, v30.8h
sxtl v30.4s, v30.4h
tbl v29.16b, {v29.16b}, v0.16b
sxtl v9.2d, v9.2s
fmov x12, d12
fmov x13, d31
fmov x16, d6
sxtl v8.2d, v8.2s
fmov x11, d4
fmov x15, d1
fmov x18, d8
fmov x26, d22
mul x19, x16, x13
fmov x16, d7
mul x0, x11, x10
mov x13, v9.d[1]
str x8, [sp,#264]
mov x8, v12.d[1]
sxtl2 v12.2d, v11.4s
sxtl v11.2d, v11.2s
mov x11, v8.d[1]
mul x15, x15, x12
fmov x12, d5
mul x2, x16, x18
fmov x16, d11
fmov x18, d20
str x8, [sp,#256]
mov x8, v12.d[1]
mul x18, x18, x16
fmov x16, d21
mov x17, v11.d[1]
str x8, [sp,#248]
fmov x8, d12
sxtl v12.2d, v13.2s
mov x4, v12.d[1]
str x8, [sp,#240]
mov x8, v31.d[1]
sxtl2 v31.2d, v10.4s
sxtl v10.2d, v10.2s
mov x22, v31.d[1]
fmov x21, d31
sxtl v31.2d, v30.2s
sxtl2 v30.2d, v30.4s
mov x7, v10.d[1]
fmov x14, d30
mov x20, v30.d[1]
sxtl v30.4s, v29.4h
sxtl2 v29.4s, v29.8h
sxtl2 v13.2d, v29.4s
sxtl v8.2d, v30.2s
mov x27, v31.d[1]
sxtl2 v11.2d, v30.4s
fmov d30, x15
mul x12, x12, x14
fmov x14, d9
sxtl v9.2d, v29.2s
fmov d29, x0
fmov x0, d12
fmov x28, d31
ldr q31, [x3]
str x8, [sp,#232]
fmov x1, d10
fmov x8, d24
mul x26, x26, x1
fmov x1, d11
mov x15, v13.d[1]
mov x30, v9.d[1]
fmov x9, d9
sxtl v9.4s, v31.4h
sxtl2 v31.4s, v31.8h
mul x8, x8, x28
fmov x24, d8
fmov d10, x2
sub x3, x3, #0x2
mul x5, x16, x0
ldr x23, [sp,#128]
fmov x16, d13
mov x0, v11.d[1]
sxtl v12.2d, v31.2s
sxtl2 v31.2d, v31.4s
mov x2, v12.d[1]
fmov x10, d12
fmov d12, x26
fmov d11, x5
mov x5, v8.d[1]
sxtl v8.2d, v9.2s
mul x4, x23, x4
ldr x23, [sp,#120]
mul x7, x23, x7
mov x26, v8.d[1]
fmov x28, d8
fmov d8, x8
fmov x8, d23
mul x8, x8, x24
ldr x23, [sp,#112]
mul x5, x23, x5
ldr x23, [sp,#104]
mov x24, v31.d[1]
mul x27, x23, x27
mov v11.d[1], x4
fmov x4, d31
mov v12.d[1], x7
fmov d31, x8
mov v8.d[1], x27
sxtl2 v9.2d, v9.4s
mov v31.d[1], x5
add v31.2d, v31.2d, v8.2d
fmov d8, x18
fmov x18, d19
mul x9, x18, x9
mov x7, v9.d[1]
fmov x8, d9
add v9.2d, v11.2d, v12.2d
ldr x18, [sp,#136]
add v31.2d, v31.2d, v9.2d
mul x17, x18, x17
fmov d9, x9
ldr x9, [sp,#144]
mul x9, x9, x30
mov v8.d[1], x17
fmov x17, d18
mov v9.d[1], x9
mul x9, x17, x14
add v8.2d, v9.2d, v8.2d
fmov d9, x9
ldr x9, [sp,#176]
mul x9, x9, x11
ldr x11, [sp,#184]
mul x11, x11, x20
mov v10.d[1], x9
ldr x9, [sp,#152]
mul x9, x9, x13
ldr x13, [sp,#232]
mov v9.d[1], x9
fmov x9, d17
mul x9, x9, x1
add v9.2d, v10.2d, v9.2d
fmov d10, x12
ldr x12, [sp,#200]
add v8.2d, v8.2d, v9.2d
fmov d9, x9
ldr x9, [sp,#160]
mul x12, x12, x13
add v31.2d, v31.2d, v8.2d
mov v10.d[1], x11
fmov x11, d16
mul x9, x9, x0
mul x11, x11, x16
mov v9.d[1], x9
fmov x9, d2
mul x9, x9, x21
add v9.2d, v9.2d, v10.2d
fmov d10, x19
fmov d8, x9
fmov x9, d3
mov v10.d[1], x12
ldr x12, [sp,#192]
mul x12, x12, x22
mov v8.d[1], x12
ldr x12, [sp,#240]
mul x9, x9, x12
ldr x12, [sp,#248]
add v8.2d, v10.2d, v8.2d
fmov d10, x9
ldr x9, [sp,#208]
add v8.2d, v9.2d, v8.2d
fmov d9, x11
fmov x11, d25
mul x9, x9, x12
mov v10.d[1], x9
ldr x9, [sp,#168]
mul x9, x9, x15
mov v9.d[1], x9
mul x9, x11, x10
ldr x10, [sp,#224]
ldp x12, x11, [sp,#256]
mul x10, x10, x11
ldr x11, [sp,#216]
mul x11, x11, x12
mov v29.d[1], x10
add v9.2d, v9.2d, v10.2d
mov v30.d[1], x11
fmov d10, x9
fmov x9, d27
fmov x10, d26
mul x9, x9, x28
mul x10, x10, x4
add v29.2d, v29.2d, v30.2d
fmov d30, x9
add v29.2d, v9.2d, v29.2d
fmov d9, x10
add v29.2d, v8.2d, v29.2d
ldp x10, x9, [sp,#88]
mul x9, x9, x2
mov v10.d[1], x9
ldr x9, [sp,#80]
mul x9, x9, x24
mov v9.d[1], x9
fmov x9, d28
mul x10, x10, x26
mul x8, x9, x8
mov v30.d[1], x10
ldr x10, [sp,#72]
mul x10, x10, x7
add v30.2d, v30.2d, v10.2d
fmov d8, x8
mov v8.d[1], x10
add w10, w29, #0x79
add x29, x29, #0x1
add v8.2d, v8.2d, v9.2d
add v29.2d, v31.2d, v29.2d
addp d29, v29.2d
fmov x8, d29
add v30.2d, v30.2d, v8.2d
addp d29, v30.2d
fmov x9, d29
add x8, x8, x9
cmp x8, x25
csel x25, x8, x25, gt
csel w6, w10, w6, gt
cbnz x29, 406938 <Gsm_Long_Term_Predictor+0x2b8>

When the optimisation is not applied, the lowered code is

ldursh x4, [x2,#-40]
ldr x6, [sp,#200]
mul x4, x6, x4
ldursh x5, [x2,#-38]
ldr x6, [sp,#192]
madd x4, x6, x5, x4
ldursh x5, [x2,#-36]
ldp x7, x6, [sp,#176]
madd x4, x6, x5, x4
ldursh x5, [x2,#-34]
ldursh x6, [x2,#-32]
madd x4, x7, x5, x4
ldr x5, [sp,#168]
madd x4, x5, x6, x4
ldursh x5, [x2,#-30]
ldp x7, x6, [sp,#152]
madd x4, x6, x5, x4
ldursh x5, [x2,#-28]
ldursh x6, [x2,#-26]
madd x4, x7, x5, x4
ldr x5, [sp,#144]
madd x4, x5, x6, x4
ldursh x5, [x2,#-24]
ldp x7, x6, [sp,#128]
madd x4, x6, x5, x4
ldursh x5, [x2,#-22]
ldursh x6, [x2,#-20]
madd x4, x7, x5, x4
ldr x5, [sp,#120]
madd x4, x5, x6, x4
ldursh x5, [x2,#-18]
ldp x7, x6, [sp,#104]
madd x4, x6, x5, x4
ldursh x5, [x2,#-16]
ldursh x6, [x2,#-14]
madd x4, x7, x5, x4
ldr x5, [sp,#96]
madd x4, x5, x6, x4
ldursh x5, [x2,#-12]
ldp x7, x6, [sp,#80]
madd x4, x6, x5, x4
ldursh x5, [x2,#-10]
ldursh x6, [x2,#-8]
madd x4, x7, x5, x4
ldr x5, [sp,#72]
madd x4, x5, x6, x4
ldursh x5, [x2,#-6]
ldr x6, [sp,#64]
madd x4, x6, x5, x4
ldursh x5, [x2,#-4]
ldrsh x6, [x3,#-2]!
madd x4, x26, x5, x4
ldrsh x5, [x2]
madd x4, x27, x6, x4
madd x4, x28, x5, x4
ldrsh x5, [x2,#2]
madd x4, x29, x5, x4
ldrsh x5, [x2,#4]
madd x4, x30, x5, x4
ldrsh x5, [x2,#6]
madd x4, x24, x5, x4
ldrsh x5, [x2,#8]
madd x4, x23, x5, x4
ldrsh x5, [x2,#10]
madd x4, x25, x5, x4
ldrsh x5, [x2,#12]
madd x4, x19, x5, x4
ldrsh x5, [x2,#14]
madd x4, x20, x5, x4
ldrsh x5, [x2,#16]
madd x4, x22, x5, x4
ldrsh x5, [x2,#18]
madd x4, x21, x5, x4
ldrsh x5, [x2,#20]
madd x4, x10, x5, x4
ldrsh x5, [x2,#22]
madd x4, x11, x5, x4
ldrsh x5, [x2,#24]
madd x4, x12, x5, x4
ldrsh x5, [x2,#26]
madd x4, x13, x5, x4
ldrsh x5, [x2,#28]
madd x4, x14, x5, x4
ldrsh x5, [x2,#30]
madd x4, x15, x5, x4
ldrsh x5, [x2,#32]
madd x4, x16, x5, x4
ldrsh x5, [x2,#34]
madd x4, x17, x5, x4
ldrsh x5, [x2,#36]
ldrsh x2, [x2,#38]
madd x4, x18, x5, x4
madd x2, x0, x2, x4
cmp x2, x8
csel x8, x2, x8, gt
add w2, w1, #0x79
csel w9, w2, w9, gt
mov x2, x3
adds x1, x1, #0x1
b.ne 4069b0 <Gsm_Long_Term_Predictor+0x330>
eastig commented 6 years ago

Hi Eli,

> AArch64 i16 is converted to i64 but in Thumb2 i16 is converted to i32.

> It looks like that's a source-code issue? The code is using "long" instead of a fixed-width type. Not really anything we can do about that.

You are right. This is the LP64 curse.

> I would guess this is some sort of cost-model problem, specifically underestimating the cost of "mul nsw <8 x i64> %56, %104". -debug output should clarify what's happening.

I'll run with '-debug' to get the cost. BTW, Shahid has submitted https://reviews.llvm.org/D41324 which adjusts the cost.
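
For reference, a hedged example of how that cost-model debug output can be obtained; this assumes an assertions-enabled compiler build and that the SLP vectorizer's debug type is "SLP" (the exact flags are not taken from this report):

clang -O3 -mcpu=cortex-a57 -c long_term.c -mllvm -debug-only=SLP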

efriedma-quic commented 6 years ago

> AArch64 i16 is converted to i64 but in Thumb2 i16 is converted to i32.

It looks like that's a source-code issue? The code is using "long" instead of a fixed-width type. Not really anything we can do about that.

I would guess this is some sort of cost-model problem, specifically underestimating the cost of "mul nsw <8 x i64> %56, %104". -debug output should clarify what's happening.

eastig commented 6 years ago

The Thumb2 versions of the benchmarks show a ~8% performance improvement in Gsm_Long_Term_Predictor:

http://lnt.llvm.org/db_default/v4/nts/profile/1604450/83635/83625
http://lnt.llvm.org/db_default/v4/nts/profile/1604573/83635/83625

The vectorised version of the main loop is lowered to fewer instructions.

A difference between the AArch64 and Thumb2 IR reaching the SLP Vectorizer is that on AArch64 i16 is converted to i64, whereas on Thumb2 i16 is converted to i32.
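
To make the width difference concrete, here is a minimal illustrative example (assuming, as discussed above, that the benchmark accumulates into a plain long): on LP64 AArch64 long is 64 bits, so the i16 operands are sign-extended to i64 and the reduction runs in 64-bit lanes; on 32-bit ARM/Thumb2 (ILP32) long is 32 bits, so the same source gives a 32-bit reduction.

#include <stdio.h>

int main(void)
{
    short a = 1000, b = 2000;

    /* The benchmark accumulates into "long":
     *  - AArch64 Linux (LP64): sizeof(long) == 8, so each product is
     *    sign-extended to 64 bits and the SLP reduction works on i64 lanes.
     *  - 32-bit ARM / Thumb2 (ILP32): sizeof(long) == 4, so the same
     *    source produces an i32 reduction instead. */
    long acc = (long)a * b;

    printf("sizeof(long) = %zu, acc = %ld\n", sizeof(long), acc);
    return 0;
}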

On our boards we see ~20-22% performance improvement in the benchmarks.

A command to reproduce:

clang -DNDEBUG -I/work/llvm-test-suite/MultiSource/Benchmarks/mediabench/gsm/toast -O3 -DNDEBUG -mcpu=cortex-a57 -fomit-frame-pointer -w -Werror=date-time -DNeedFunctionPrototypes=1 -DSASR -c /work/llvm-test-suite/MultiSource/Benchmarks/mediabench/gsm/toast/long_term.c -mthumb -mllvm -print-after-all -mllvm --filter-print-funcs=Gsm_Long_Term_Predictor

The SLP vectorizer result for the Thumb2 version:

for.body47.i: ; preds = %for.body47.i, %if.end31.i %lambda.0579.i = phi i32 [ 40, %if.end31.i ], [ %inc331.i, %for.body47.i ] %Nc.0578.i = phi i16 [ 40, %if.end31.i ], [ %spec.select571.i, %for.body47.i ] %L_max.0577.i = phi i32 [ 0, %if.end31.i ], [ %spec.select570.i, %for.body47.i ] %sub50.i = sub nsw i32 0, %lambda.0579.i %arrayidx51.i = getelementptr inbounds i16, i16 %dp, i32 %sub50.i %sub55.i = sub nsw i32 1, %lambda.0579.i %arrayidx56.i = getelementptr inbounds i16, i16 %dp, i32 %sub55.i %sub61.i = sub nsw i32 2, %lambda.0579.i %arrayidx62.i = getelementptr inbounds i16, i16 %dp, i32 %sub61.i %sub68.i = sub nsw i32 3, %lambda.0579.i %arrayidx69.i = getelementptr inbounds i16, i16 %dp, i32 %sub68.i %sub75.i = sub nsw i32 4, %lambda.0579.i %arrayidx76.i = getelementptr inbounds i16, i16 %dp, i32 %sub75.i %sub82.i = sub nsw i32 5, %lambda.0579.i %arrayidx83.i = getelementptr inbounds i16, i16 %dp, i32 %sub82.i %sub89.i = sub nsw i32 6, %lambda.0579.i %arrayidx90.i = getelementptr inbounds i16, i16 %dp, i32 %sub89.i %sub96.i = sub nsw i32 7, %lambda.0579.i %arrayidx97.i = getelementptr inbounds i16, i16 %dp, i32 %sub96.i %sub103.i = sub nsw i32 8, %lambda.0579.i %arrayidx104.i = getelementptr inbounds i16, i16 %dp, i32 %sub103.i %sub110.i = sub nsw i32 9, %lambda.0579.i %arrayidx111.i = getelementptr inbounds i16, i16 %dp, i32 %sub110.i %sub117.i = sub nsw i32 10, %lambda.0579.i %arrayidx118.i = getelementptr inbounds i16, i16 %dp, i32 %sub117.i %sub124.i = sub nsw i32 11, %lambda.0579.i %arrayidx125.i = getelementptr inbounds i16, i16 %dp, i32 %sub124.i %sub131.i = sub nsw i32 12, %lambda.0579.i %arrayidx132.i = getelementptr inbounds i16, i16 %dp, i32 %sub131.i %sub138.i = sub nsw i32 13, %lambda.0579.i %arrayidx139.i = getelementptr inbounds i16, i16 %dp, i32 %sub138.i %sub145.i = sub nsw i32 14, %lambda.0579.i %arrayidx146.i = getelementptr inbounds i16, i16 %dp, i32 %sub145.i %sub152.i = sub nsw i32 15, %lambda.0579.i %arrayidx153.i = getelementptr inbounds i16, i16 %dp, i32 %sub152.i %sub159.i = sub nsw i32 16, %lambda.0579.i %arrayidx160.i = getelementptr inbounds i16, i16 %dp, i32 %sub159.i %sub166.i = sub nsw i32 17, %lambda.0579.i %arrayidx167.i = getelementptr inbounds i16, i16 %dp, i32 %sub166.i %sub173.i = sub nsw i32 18, %lambda.0579.i %arrayidx174.i = getelementptr inbounds i16, i16 %dp, i32 %sub173.i %sub180.i = sub nsw i32 19, %lambda.0579.i %arrayidx181.i = getelementptr inbounds i16, i16 %dp, i32 %sub180.i %sub187.i = sub nsw i32 20, %lambda.0579.i %arrayidx188.i = getelementptr inbounds i16, i16 %dp, i32 %sub187.i %sub194.i = sub nsw i32 21, %lambda.0579.i %arrayidx195.i = getelementptr inbounds i16, i16 %dp, i32 %sub194.i %sub201.i = sub nsw i32 22, %lambda.0579.i %arrayidx202.i = getelementptr inbounds i16, i16 %dp, i32 %sub201.i %sub208.i = sub nsw i32 23, %lambda.0579.i %arrayidx209.i = getelementptr inbounds i16, i16 %dp, i32 %sub208.i %sub215.i = sub nsw i32 24, %lambda.0579.i %arrayidx216.i = getelementptr inbounds i16, i16 %dp, i32 %sub215.i %sub222.i = sub nsw i32 25, %lambda.0579.i %arrayidx223.i = getelementptr inbounds i16, i16 %dp, i32 %sub222.i %sub229.i = sub nsw i32 26, %lambda.0579.i %arrayidx230.i = getelementptr inbounds i16, i16 %dp, i32 %sub229.i %sub236.i = sub nsw i32 27, %lambda.0579.i %arrayidx237.i = getelementptr inbounds i16, i16 %dp, i32 %sub236.i %sub243.i = sub nsw i32 28, %lambda.0579.i %arrayidx244.i = getelementptr inbounds i16, i16 %dp, i32 %sub243.i %sub250.i = sub nsw i32 29, %lambda.0579.i %arrayidx251.i = getelementptr inbounds 
i16, i16 %dp, i32 %sub250.i %sub257.i = sub nsw i32 30, %lambda.0579.i %arrayidx258.i = getelementptr inbounds i16, i16 %dp, i32 %sub257.i %sub264.i = sub nsw i32 31, %lambda.0579.i %arrayidx265.i = getelementptr inbounds i16, i16 %dp, i32 %sub264.i %50 = bitcast i16 %arrayidx51.i to <32 x i16> %51 = load <32 x i16>, <32 x i16> %50, align 2, !tbaa !​3 %52 = shufflevector <32 x i16> %51, <32 x i16> undef, <32 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> %53 = sext <32 x i16> %52 to <32 x i32> %54 = mul nsw <32 x i32> %37, %53 %add.i = add nsw i32 undef, undef %add65.i = add nsw i32 %add.i, undef %add72.i = add nsw i32 %add65.i, undef %add79.i = add nsw i32 %add72.i, undef %add86.i = add nsw i32 %add79.i, undef %add93.i = add nsw i32 %add86.i, undef %add100.i = add nsw i32 %add93.i, undef %add107.i = add nsw i32 %add100.i, undef %add114.i = add nsw i32 %add107.i, undef %add121.i = add nsw i32 %add114.i, undef %add128.i = add nsw i32 %add121.i, undef %add135.i = add nsw i32 %add128.i, undef %add142.i = add nsw i32 %add135.i, undef %add149.i = add nsw i32 %add142.i, undef %add156.i = add nsw i32 %add149.i, undef %add163.i = add nsw i32 %add156.i, undef %add170.i = add nsw i32 %add163.i, undef %add177.i = add nsw i32 %add170.i, undef %add184.i = add nsw i32 %add177.i, undef %add191.i = add nsw i32 %add184.i, undef %add198.i = add nsw i32 %add191.i, undef %add205.i = add nsw i32 %add198.i, undef %add212.i = add nsw i32 %add205.i, undef %add219.i = add nsw i32 %add212.i, undef %add226.i = add nsw i32 %add219.i, undef %add233.i = add nsw i32 %add226.i, undef %add240.i = add nsw i32 %add233.i, undef %add247.i = add nsw i32 %add240.i, undef %add254.i = add nsw i32 %add247.i, undef %add261.i = add nsw i32 %add254.i, undef %add268.i = add nsw i32 %add261.i, undef %sub271.i = sub nsw i32 32, %lambda.0579.i %arrayidx272.i = getelementptr inbounds i16, i16 %dp, i32 %sub271.i %sub278.i = sub nsw i32 33, %lambda.0579.i %arrayidx279.i = getelementptr inbounds i16, i16 %dp, i32 %sub278.i %sub285.i = sub nsw i32 34, %lambda.0579.i %arrayidx286.i = getelementptr inbounds i16, i16 %dp, i32 %sub285.i %sub292.i = sub nsw i32 35, %lambda.0579.i %arrayidx293.i = getelementptr inbounds i16, i16 %dp, i32 %sub292.i %sub299.i = sub nsw i32 36, %lambda.0579.i %arrayidx300.i = getelementptr inbounds i16, i16 %dp, i32 %sub299.i %sub306.i = sub nsw i32 37, %lambda.0579.i %arrayidx307.i = getelementptr inbounds i16, i16 %dp, i32 %sub306.i %sub313.i = sub nsw i32 38, %lambda.0579.i %arrayidx314.i = getelementptr inbounds i16, i16 %dp, i32 %sub313.i %sub320.i = sub nsw i32 39, %lambda.0579.i %arrayidx321.i = getelementptr inbounds i16, i16 %dp, i32 %sub320.i %55 = bitcast i16 %arrayidx272.i to <8 x i16> %56 = load <8 x i16>, <8 x i16> %55, align 2, !tbaa !​3 %57 = sext <8 x i16> %56 to <8 x i32> %58 = mul nsw <8 x i32> %49, %57 %add275.i = add nsw i32 %add268.i, undef %add282.i = add nsw i32 %add275.i, undef %add289.i = add nsw i32 %add282.i, undef %add296.i = add nsw i32 %add289.i, undef %add303.i = add nsw i32 %add296.i, undef %add310.i = add nsw i32 %add303.i, undef %add317.i = add nsw i32 %add310.i, undef %rdx.shuf = shufflevector <32 x i32> %54, <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 
undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %bin.rdx = add nsw <32 x i32> %54, %rdx.shuf %rdx.shuf239 = shufflevector <32 x i32> %bin.rdx, <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %bin.rdx240 = add nsw <32 x i32> %bin.rdx, %rdx.shuf239 %rdx.shuf241 = shufflevector <32 x i32> %bin.rdx240, <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %bin.rdx242 = add nsw <32 x i32> %bin.rdx240, %rdx.shuf241 %rdx.shuf243 = shufflevector <32 x i32> %bin.rdx242, <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %bin.rdx244 = add nsw <32 x i32> %bin.rdx242, %rdx.shuf243 %rdx.shuf245 = shufflevector <32 x i32> %bin.rdx244, <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %bin.rdx246 = add nsw <32 x i32> %bin.rdx244, %rdx.shuf245 %59 = extractelement <32 x i32> %bin.rdx246, i32 0 %rdx.shuf247 = shufflevector <8 x i32> %58, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> %bin.rdx248 = add nsw <8 x i32> %58, %rdx.shuf247 %rdx.shuf249 = shufflevector <8 x i32> %bin.rdx248, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %bin.rdx250 = add nsw <8 x i32> %bin.rdx248, %rdx.shuf249 %rdx.shuf251 = shufflevector <8 x i32> %bin.rdx250, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %bin.rdx252 = add nsw <8 x i32> %bin.rdx250, %rdx.shuf251 %60 = extractelement <8 x i32> %bin.rdx252, i32 0 %op.rdx = add nsw i32 %59, %60 %add324.i = add nsw i32 %add317.i, undef %cmp325.i = icmp sgt i32 %op.rdx, %L_max.0577.i %conv328.i = trunc i32 %lambda.0579.i to i16 %spec.select570.i = select i1 %cmp325.i, i32 %op.rdx, i32 %L_max.0577.i %spec.select571.i = select i1 %cmp325.i, i16 %conv328.i, i16 %Nc.0578.i %inc331.i = add nuw nsw i32 %lambda.0579.i, 1 %exitcond.i = icmp eq i32 %inc331.i, 121 br i1 %exitcond.i, label %for.end332.i, label %for.body47.i