rust-lang / rust

Empowering everyone to build reliable and efficient software.
https://www.rust-lang.org
Other
98.14k stars 12.69k forks source link

Constructing an iterator from a slice or Vec doesn't optimise completely #11751

Closed huonw closed 10 years ago

huonw commented 10 years ago
#![crate_type = "lib"]

pub fn slice(s: &[uint]) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}
pub fn vec(s: Vec<uint>) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

pub fn owned(s: ~[uint]) -> uint {
    for &j in s.iter() {
        if j > 10 { return j }
    }
    0
}

Compiled with -O --lib --emit-llvm -S gives the following. The only major difference between &[]/Vec and ~[] is the pair of lines marked THIS CHECK, which, we think, is because when constructing an iterator from ~[] we do a pointer offset and dereference, so LLVM knows the pointers are non-null (in the slice/Vec case, the match it.next() { None => ... } part of the for loop isn't removed).

; ModuleID = '11751.rs'
target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"struct.std::vec::Vec<uint>[#1]" = type { i64, i64, i64* }

; Function Attrs: nounwind readonly uwtable
define i64 @_ZN5slice20h084d58a6edab0287daa4v0.0E({ i64*, i64 }* noalias nocapture nonnull readonly) unnamed_addr #0 {
entry-block:
  %1 = getelementptr inbounds { i64*, i64 }* %0, i64 0, i32 0
  %2 = load i64** %1, align 8
  %3 = getelementptr inbounds { i64*, i64 }* %0, i64 0, i32 1
  %4 = load i64* %3, align 8
  %5 = getelementptr inbounds i64* %2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %match_else, %entry-block
  %6 = phi i64* [ %9, %match_else ], [ %2, %entry-block ]
  %7 = icmp eq i64* %6, %5
  %8 = icmp eq i64* %6, null     ; THIS CHECK!
  %or.cond = or i1 %7, %8
  br i1 %or.cond, label %return, label %match_else

match_else:                                       ; preds = %loop_body
  %9 = getelementptr inbounds i64* %6, i64 1
  %10 = load i64* %6, align 8
  %11 = icmp ugt i64 %10, 10
  br i1 %11, label %return, label %loop_body

return:                                           ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %10, %match_else ], [ 0, %loop_body ]
  ret i64 %__make_return_pointer.0
}

; Function Attrs: uwtable
define i64 @_ZN3vec20h4963a1d1a9f58c9eUaa4v0.0E(%"struct.std::vec::Vec<uint>[#1]"* noalias nocapture nonnull readonly) unnamed_addr #1 {
entry-block:
  %1 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 2
  %2 = load i64** %1, align 8
  %3 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 0
  %4 = load i64* %3, align 8
  %5 = getelementptr inbounds i64* %2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %entry-block, %match_else
  %6 = phi i64* [ %2, %entry-block ], [ %9, %match_else ]
  %7 = icmp eq i64* %6, %5
  %8 = icmp eq i64* %6, null      ; THIS CHECK!
  %or.cond = or i1 %7, %8
  br i1 %or.cond, label %clean_custom_6, label %match_else

match_else:                                       ; preds = %loop_body
  %9 = getelementptr inbounds i64* %6, i64 1
  %10 = load i64* %6, align 8
  %11 = icmp ugt i64 %10, 10
  br i1 %11, label %clean_custom_6, label %loop_body

clean_custom_6:                                   ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %10, %match_else ], [ 0, %loop_body ]
  %12 = getelementptr inbounds %"struct.std::vec::Vec<uint>[#1]"* %0, i64 0, i32 1
  %13 = load i64* %12, align 8
  %14 = icmp eq i64 %13, 0
  br i1 %14, label %"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit", label %then-block-549-.i.i

then-block-549-.i.i:                              ; preds = %clean_custom_6
  %15 = bitcast i64* %2 to i8*
  tail call void @je_dallocx(i8* %15, i32 3)
  br label %"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit"

"_ZN25std..vec..Vec$LT$uint$GT$14glue_drop.115917h10684057aba082a7E.exit": ; preds = %clean_custom_6, %then-block-549-.i.i
  ret i64 %__make_return_pointer.0
}

declare void @je_dallocx(i8*, i32) unnamed_addr #2

; Function Attrs: uwtable
define i64 @_ZN5owned20h3f7b4426165c9e96Bba4v0.0E({ i64, i64, [0 x i64] }* noalias nonnull) unnamed_addr #1 {
entry-block:
  %1 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 2, i64 0
  %2 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 0
  %3 = load i64* %2, align 8
  %4 = lshr i64 %3, 3
  %5 = getelementptr inbounds { i64, i64, [0 x i64] }* %0, i64 0, i32 2, i64 %4
  br label %loop_body

loop_body:                                        ; preds = %entry-block, %match_else
  %6 = phi i64* [ %1, %entry-block ], [ %8, %match_else ]
  %7 = icmp eq i64* %6, %5
  br i1 %7, label %"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit", label %match_else

match_else:                                       ; preds = %loop_body
  %8 = getelementptr inbounds i64* %6, i64 1
  %9 = load i64* %6, align 8
  %10 = icmp ugt i64 %9, 10
  br i1 %10, label %"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit", label %loop_body

"_ZN17_$UP$$x5buint$x5d14glue_drop.120017hf14aae96f6d219c9E.exit": ; preds = %loop_body, %match_else
  %__make_return_pointer.0 = phi i64 [ %9, %match_else ], [ 0, %loop_body ]
  %11 = bitcast { i64, i64, [0 x i64] }* %0 to i8*
  tail call void @je_dallocx(i8* %11, i32 3)
  ret i64 %__make_return_pointer.0
}

attributes #0 = { nounwind readonly uwtable "split-stack" }
attributes #1 = { uwtable "split-stack" }
attributes #2 = { "split-stack" }
thestinger commented 10 years ago

Implementing #9546 would fix this.

zwarich commented 10 years ago

In theory, LLVM should be able to determine that this null check is unnecessary without additional metadata. There are two separate changes to LLVM's optimizer that are required:

  1. An inbounds GEP either produces a valid pointer into an allocated object or a poison value. In address space 0, there is no allocated object at the zero address. This implies that any inbounds GEP in address space 0 is a poison value. According to the LLVM LangRef, the icmp would depend on the poison value, and any instruction that depends on poison values exhibits undefined behavior. However, Dan Gohman (sunfish) tells me that it was only intended to apply to instructions exhibiting externally visible side effects, as otherwise it would mean that any add instruction could potentially have undefined behavior. Any chain of inbounds GEPs and phis ending with a load of the inbounds value should be undefined behavior, because of how poison values behave like undef. We can't just assume that an inbounds GEP produces a nonnull value, because then that implies that inbounds GEPs themselves can have undefined behavior when the runtime value is actually null. Lots of optimizations depend on being able to hoist GEPs, e.g. out of loops, and that wouldn't be possible if they potentially had undefined behavior.
  2. The current LazyValueInfo / CorrelatedValuePropagation passes are not optimistic with respect to control flow. Since there is a phi here, the optimization opportunity would be missed even if LazyValueInfo understood that the pointers are nonnull. This would improve other optimizations as well, but it would probably hurt compile-time a bit. IIRC changes along these lines have been proposed for LLVM in the past, but they have never gone in.

Correctly implementing the rule for poison values in LazyValueInfo would be quite difficult, because it requires reasoning about control-dependence with respect to poison values. Also, the cost of making LazyValueInfo optimistic might be too high in compile time to get the patch landed.

In #9546 there is a proposal to add metadata on LLVM instructions that indicates that the instruction produces a nonnull value. There are two reasons why this proposal would be a bit more difficult than it seems at first:

  1. If an instruction is marked with the nonnull metadata, then what happens if the value actually is null at runtime? Is it undefined behavior, or is it a poison value? If it is undefined behavior, then this means that any instruction with the nonnull metadata would potentially have side effects, and code motion that modifies control dependence of this instruction would be prohibited. Dan and I realized that this is also a problem with the current range metadata in LLVM that has likely gone unnoticed. It isn't a problem with the existing nonnull attribute on function parameters and return values, because the attribute is erased upon inlining.
  2. The nonnull metadata would only tell you that the inputs to the phi are nonnull, but LazyValueInfo would still not be able to propagate that to the phi because of the inadequacy mentioned above. Another pass would have to propagate nonnull on phis.

Another option would be to write a pass that looks for chains of inbounds GEPs, nonnull parameters / return values, and phis of these that feed into loads and stores control-dependent on null checks of some intermediate value in the chain. The null checks could be replaced with false, and then hopefully other optimization passes would be able to clean everything up.

pcwalton commented 10 years ago

Sounds like the last option is the easiest. I also like the fact that it's a separate pass, meaning that if we have trouble getting it upstream we can maintain it in our branch for a while.

zwarich commented 10 years ago

I wrote the optimization pass I described. It is able to optimize the first case (with &[uint]), but it is unable to optimize the second case (with Vec<uint>). I have an informal inductive argument for why it should still always be nonnull with LLVM IR's poison value semantics, but implementing it as code will be trickier.

zwarich commented 10 years ago

Unfortunately, my pass causes the compiled rustc to segfault when compiling liblibc. That might be a pain to track down. The problem could be in my code, or rustc could just be marking a GEP inbounds when it shouldn't.

zwarich commented 10 years ago

I found the issue and put a first cut of my pass up as https://github.com/rust-lang/llvm/pull/13.

brson commented 10 years ago

\o/ On Jun 28, 2014 12:24 AM, "Cameron Zwarich" notifications@github.com wrote:

I found the issue and put a first cut of my pass up as rust-lang/llvm#13 https://github.com/rust-lang/llvm/pull/13.

— Reply to this email directly or view it on GitHub https://github.com/rust-lang/rust/issues/11751#issuecomment-47420627.

zwarich commented 10 years ago

The pass that was landed handles the &[T] case. There are two obvious remaining things to do:

  1. Generalize to the case of multiple null checks involved in a single branch. This shouldn't be too hard.
  2. Handle the case where the base pointer is not known to be nonnull, but everything else in the recurrence is an inbounds GEP. This corresponds to the Vec<T> case. In theory, we could add nonnull metadata to the load (since loads are allowed to have undefined behavior), but applying the LLVM rules inductively actually lets the checks be removed without that.
zwarich commented 10 years ago

Actually, that second point applies to the IR generated by &[T] as well. I must've oversimplified it when I made test cases. I'll try to solve that soon then.

zwarich commented 10 years ago

I have local changes that fix those two items. In order to make this apply to zip(), I also need to track dominating conditions from other blocks. This shouldn't be that much more work, so maybe I'll hold out until I have that working.

zwarich commented 10 years ago

Those changes are up as https://github.com/rust-lang/llvm/pull/14/. I'll need to add less conservative control dependence checking, and then it should be able to handle arbitrary chaining of zipped iterators.