irvingc / dbscan-on-spark

An implementation of DBSCAN running on top of Apache Spark
Apache License 2.0
183 stars 58 forks source link

Unable to implement dbscan in spark. #14

Open BhaveshMandalkar opened 5 years ago

BhaveshMandalkar commented 5 years ago

I am trying to run DBSCAN on Spark, but I am running into a lot of issues. Can anyone please tell me how to use this DBSCAN implementation from Spark with Scala? I have included my main class and pom.xml below. Please tell me if any version is wrong or any dependency is missing.

This is My main class:

package DBScanClustering

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.dbscan.DBSCAN
import org.apache.spark.mllib.linalg.Vectors

/** Driver program: clusters the points of a CSV file stored in HDFS with DBSCAN. */
object MyMain {

  /**
   * Reads comma-separated numeric rows, trains a DBSCAN model on them and
   * writes one `x,y,cluster` line per labelled point back to HDFS.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DBSCAN Sample")
    val sc = new SparkContext(conf)

    // Release the SparkContext even when reading, training or saving throws.
    try {
      val data = sc.textFile("hdfs://localhost:8020/DBScan/labeled_data.csv")

      // Every field is parsed with toDouble, so a non-numeric field (for
      // example a header row) fails the job with a NumberFormatException.
      val parsedData = data.map(s => Vectors.dense(s.split(',').map(_.toDouble))).cache()

      val model = DBSCAN.train(
        parsedData,
        eps = 0.6,
        minPoints = 5,
        maxPointsPerPartition = 8)

      model.labeledPoints
        .map(p => s"${p.x},${p.y},${p.cluster}")
        .saveAsTextFile("hdfs://localhost:8020/DBScan/OPTS")
    } finally {
      sc.stop()
    }
  }
}

and this is my POM.xml

<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

<modelVersion>4.0.0</modelVersion>

<groupId>com.irvingc.spark</groupId>
<artifactId>dbscan-on-spark_2.10</artifactId>
<version>0.2.0-SNAPSHOT</version>

<name>Distributed DBSCAN on Apache Spark</name>
<url>http://www.irvingc.com/dbscan-on-spark</url>
<packaging>jar</packaging>

<scm>
    <connection>scm:git:https://github.com/irvingc/dbscan-on-spark</connection>
    <developerConnection>scm:git:git@github.com:irvingc/dbscan-on-spark.git</developerConnection>
    <url>https://github.com/irvingc/dbscan-on-spark</url>
</scm>

<developers>
    <developer>
        <id>irvingc</id>
        <name>Irving Cordova</name>
        <email>irving@irvingc.com</email>
        <url>http://www.irvingc.com</url>
    </developer>
</developers>
<properties>
    <!-- Build-wide encoding and version properties.
         NOTE(review): the entries in the dependencies section of this POM
         hard-code their versions and Scala suffixes instead of referencing
         scala.version, scala.binary.version, scalatest.version or
         spark.version, so editing a value here does not change what is
         resolved. spark.version (2.1.0) also matches none of the Spark
         versions declared in the dependencies section; confirm which
         version is intended. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <scala.version>2.10.4</scala.version>
    <scala.binary.version>2.10</scala.binary.version>
    <scalatest.version>2.2.1</scalatest.version>
    <spark.version>2.1.0</spark.version>
</properties>

<!-- Remote repositories consulted for dependency resolution, in declaration order. -->
<repositories>
    <repository>
        <id>central</id>
        <!-- This should be at top, it makes maven try the central repo first 
            and then others and hence faster dep resolution -->
        <name>Maven Repository</name>
        <url>https://repo1.maven.org/maven2</url>
        <releases>
            <enabled>true</enabled>
        </releases>
        <snapshots>
            <enabled>false</enabled>
        </snapshots>
    </repository>
    <!-- NOTE(review): this repository is addressed over plain http; confirm
         the host is still reachable and allowed by the Maven version in use. -->
    <repository>
        <id>dbscan-on-spark-repo</id>
        <name>Repo for DBSCAN on Spark</name>
        <url>http://dl.bintray.com/irvingc/maven</url>
    </repository>
    <!-- NOTE(review): same plain-http concern as the repository above; confirm
         the archery artifact can still be resolved from here. -->
    <repository>
        <id>archery-repo</id>
        <name>Repo for Archery R-Tree</name>
        <url>http://dl.bintray.com/meetup/maven</url>
    </repository>
</repositories>

<dependencies>
    <!-- Every Scala artifact below uses the 2.10 binary suffix so that a
         single Scala runtime ends up on the classpath. Mixing _2.10 and
         _2.11 artifacts is binary-incompatible and fails at runtime. -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>2.10.4</version>
    </dependency>

    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-compiler</artifactId>
        <version>2.10.4</version>
    </dependency>

    <!-- Spark modules: one Spark version and one Scala suffix for all of
         them. The previous declarations pulled in spark-core twice
         (_2.10 at 1.3.0 and _2.11 at 2.2.1) and spark-sql for _2.11 only.
         NOTE(review): 1.3.0 is kept from the original core/mllib entries;
         confirm it matches the Spark version on the target cluster and the
         one dbscan_2.10 0.1.0 was built against. -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.10</artifactId>
        <version>1.3.0</version>
    </dependency>

    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_2.10</artifactId>
        <version>1.3.0</version>
    </dependency>

    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.10</artifactId>
        <version>1.3.0</version>
    </dependency>

    <!-- R-tree implementation. -->
    <dependency>
        <groupId>com.meetup</groupId>
        <artifactId>archery_2.10</artifactId>
        <version>0.3.0</version>
    </dependency>

    <!-- The DBSCAN implementation used by the main class. -->
    <dependency>
        <groupId>com.irvingc.spark</groupId>
        <artifactId>dbscan_2.10</artifactId>
        <version>0.1.0</version>
    </dependency>

    <!-- Test-only dependency; its XML tags had been stripped, leaving bare
         text inside <dependencies>.
         NOTE(review): coordinates are reconstructed from that text; confirm
         that this SNAP version is published for Scala 2.10. -->
    <dependency>
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest_2.10</artifactId>
        <version>3.2.0-SNAP5</version>
        <scope>test</scope>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.0</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.scalastyle</groupId>
            <artifactId>scalastyle-maven-plugin</artifactId>
            <version>0.6.0</version>
            <configuration>
                <verbose>false</verbose>
                <failOnViolation>true</failOnViolation>
                <includeTestSourceDirectory>true</includeTestSourceDirectory>
                <failOnWarning>true</failOnWarning>
                <sourceDirectory>${basedir}/src/main/scala</sourceDirectory>
                <testSourceDirectory>${basedir}/src/test/scala</testSourceDirectory>
                <configLocation>${basedir}/scalastyle-config.xml</configLocation>
                <outputFile>${project.build.directory}/scalastyle-output.xml</outputFile>
            </configuration>
            <executions>
                <execution>
                    <goals>
                        <goal>check</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.scalatest</groupId>
            <artifactId>scalatest-maven-plugin</artifactId>
            <version>1.0</version>
            <configuration>
                <reportsDirectory>${project.build.directory}/scalatest-reports</reportsDirectory>
                <junitxml>.</junitxml>
            </configuration>
            <executions>
                <execution>
                    <id>test</id>
                    <goals>
                        <goal>test</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>

        <!-- Builds an additional "jar-with-dependencies" artifact during the
             package phase, with the application's main class recorded in
             the JAR manifest. -->
        <plugin>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.0.0</version>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <archive>
                    <manifest>
                        <!-- specify your main class here -->
                        <mainClass>DBScanClustering.MyMain</mainClass>
                    </manifest>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id> <!-- this is used for inheritance merges -->
                    <phase>package</phase> <!-- bind to the packaging phase -->
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>