NICTA / scoobi

A Scala productivity framework for Hadoop.
http://nicta.github.com/scoobi/
482 stars 97 forks source link

toPartitionedTextFile is creating too many partitions #320

Status: Closed (raronson closed this issue 10 years ago)

raronson commented 10 years ago

test case:

import org.specs2._
import com.nicta.scoobi.Scoobi._
import com.nicta.scoobi.testing.mutable._
import com.nicta.scoobi.testing.TestFiles._
import org.specs2.matcher.FileMatchers
import com.nicta.scoobi.testing.{TempFiles, SimpleJobs}
import java.io.File

/**
 * Reproduction spec: writes a small DList through `toPartitionedTextFile` and
 * checks which top-level entries end up in the output directory.
 */
class Testcase extends HadoopSpecification with SimpleJobs with FileMatchers {
  // NOTE(review): presumably forces local (non-cluster) execution — confirm against HadoopSpecification.
  override def isCluster = false

  "Too many partitions get created" >> { implicit sc: ScoobiConfiguration =>

    // Fresh temporary directory for this run; `.pp` presumably prints the value and passes it through.
    val tmpRoot = path(TempFiles.createTempDir("tmp").getPath).pp
    val outputDir = tmpRoot + "/partitions"

    // All three elements share the same first component, "a/b/c".
    val records = DList(("a/b/c", 1), ("a/b/c", 2), ("a/b/c", 3))

    // `identity` is passed as the second argument, so the key itself is used unchanged.
    records.toPartitionedTextFile(outputDir, identity).persist

    // Only the "a" directory is expected directly under the output directory.
    val topLevelEntries = new File(outputDir).list.toList
    topLevelEntries must_== List("a")
  }
}

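Running the test created the following directory structure; only `a` was expected directly under `partitions`, but `5` and `var` were created as well: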

/var/folders/6h/963cvjm52bq3s809wmz38d300000gp/T/tmp3625713493852866692
└── partitions
    ├── 5
    │   └── a
    │       └── b
    │           └── c
    ├── a
    │   └── b
    │       └── c
    │           └── ch4out5-m-00001
    └── var
        └── folders
            └── 6h
                └── 963cvjm52bq3s809wmz38d300000gp
                    └── T
                        └── tmp3625713493852866692
                            └── partitions
                                └── _SUCCESS