NICTA / scoobi

A Scala productivity framework for Hadoop.
http://nicta.github.com/scoobi/
481 stars 97 forks source link

DList concat causes no data to be output #328

Closed raronson closed 10 years ago

raronson commented 10 years ago

When concatenating a DList created before a GBK with one created after the GBK, scoobi doesn't output any data. Example spec below:

import org.specs2._
import com.nicta.scoobi.Scoobi._
import com.nicta.scoobi.testing.mutable._
import com.nicta.scoobi.testing.SimpleJobs

/**
 * Reproduction spec for the issue described above: concatenating a DList that
 * is derived before a `groupBy` with one derived after it, and persisting the
 * concatenation together with another post-`groupBy` DList, results in the
 * latter containing no data.
 *
 * The exact shape of the pipeline (which lists are built before/after the
 * `groupBy`, and which are concatenated) is what triggers the problem, so the
 * statements below should not be reordered or fused.
 */
class GraphBugSpec extends HadoopSpecification with SimpleJobs {
  // NOTE(review): presumably keeps the spec from running against a cluster —
  // confirm against HadoopSpecification.
  override def isCluster = false

  "Concat breaks graph" >> { implicit sc: ScoobiConfiguration =>

    // Input: 1 is rejected by the first validation; 2 occurs twice and 3 once.
    val init = DList(1, 2, 3, 2)

    // First validation: values below 2 become Left(error), the rest Right(value).
    val validated: DList[Either[String, Int]] = init.map(i => if(i < 2) Left("number too low") else Right(i))

    // Error messages from the first validation (derived BEFORE the groupBy).
    val errs: DList[String] = validated.collect {
      case Left(e) => e
    }

    // Values that passed the first validation; for the input above: 2, 3, 2.
    val good: DList[Int] = validated.collect {
      case Right(i) => i
    }

    // Second validation: group equal values together; a key seen more than once
    // is kept as Right(key), any other key yields a Left(error).
    // For the input above: 2 -> Right(2), 3 -> Left("Too few entries in group 3").
    val secondVal: DList[Either[String, Int]] = good.groupBy(identity).collect {
      case (k, vs) if vs.size > 1 => Right(k)
      case (k, vs)                => Left(s"Too few entries in group $k")
    }

    // Error messages from the second validation (derived AFTER the groupBy).
    val secondErrs = secondVal.collect {
      case Left(e) => e
    }

    // Keys that passed the second validation; expected to contain only 2.
    val secondGood = secondVal.collect {
      case Right(i) => i
    }

    // Persisting the three lists separately works (per the reporter):
    //persist(errs, secondErrs, secondGood)
    // Persisting the concatenation of the pre-groupBy `errs` with the
    // post-groupBy `secondErrs` alongside `secondGood` is the failing case.
    persist((errs ++ secondErrs), secondGood)
    secondGood.run.toList must_== List(2) // fails because list is empty
  }
}