A fuzzy matching string distance library for Scala and Java that includes Levenshtein distance, Jaro distance, Jaro-Winkler distance, Dice coefficient, N-Gram similarity, Cosine similarity, Jaccard similarity, Longest common subsequence, Hamming distance, and more.
Works with generalized arrays.
For more detailed information, please refer to the API Documentation.
Requires: Java 8+ or Scala 2.11+
Using sbt:
In build.sbt:
libraryDependencies += "com.github.vickumar1981" %% "stringdistance" % "1.2.7"
Using gradle:
In build.gradle:
dependencies {
compile 'com.github.vickumar1981:stringdistance_2.13:1.2.7'
}
Using Maven:
In pom.xml:
<dependency>
<groupId>com.github.vickumar1981</groupId>
<artifactId>stringdistance_2.13</artifactId>
<version>1.2.7</version>
</dependency>
Notes:
- For Scala 2.12, use the stringdistance_2.12 artifact as a dependency instead.
- For Scala 2.11, use the stringdistance_2.11 artifact as a dependency instead.
Example.scala:
// Scala example
import com.github.vickumar1981.stringdistance.StringDistance._
import com.github.vickumar1981.stringdistance.StringSound._
import com.github.vickumar1981.stringdistance.impl.{ConstantGap, LinearGap}
// Cosine Similarity
val cosSimilarity: Double = Cosine.score("hello", "chello") // 0.935
// Damerau-Levenshtein Distance
val damerauDist: Int = Damerau.distance("martha", "marhta") // 1
val damerau: Double = Damerau.score("martha", "marhta") // 0.833
// Dice Coefficient
val diceCoefficient: Double = DiceCoefficient.score("martha", "marhta") // 0.4
// Hamming Distance
val hammingDist: Int = Hamming.distance("martha", "marhta") // 2
val hamming: Double = Hamming.score("martha", "marhta") // 0.667
// Jaccard Similarity
val jaccard: Double = Jaccard.score("karolin", "kathrin", 1)
// Jaro and Jaro Winkler
val jaro: Double = Jaro.score("martha", "marhta") // 0.944
val jaroWinkler: Double = JaroWinkler.score("martha", "marhta", 0.1) // 0.961
// Levenshtein Distance
val levenshteinDist: Int = Levenshtein.distance("martha", "marhta") // 2
val levenshtein: Double = Levenshtein.score("martha", "marhta") // 0.667
// Longest Common Subsequence
val longestCommonSubSeq: Int = LongestCommonSeq.distance("martha", "marhta") // 5
// Needleman Wunsch
val needlemanWunsch: Double = NeedlemanWunsch.score("martha", "marhta", ConstantGap()) // 0.667
// N-Gram Similarity and Distance
val ngramDist: Int = NGram.distance("karolin", "kathrin", 1) // 5
val bigramDist: Int = NGram.distance("karolin", "kathrin", 2) // 2
val ngramSimilarity: Double = NGram.score("karolin", "kathrin", 1) // 0.714
val bigramSimilarity: Double = NGram.score("karolin", "kathrin", 2) // 0.333
// N-Gram tokens, returns a List[String]
val tokens: List[String] = NGram.tokens("martha", 2) // List("ma", "ar", "rt", "th", "ha")
// Overlap Similarity
val overlap: Double = Overlap.score("karolin", "kathrin", 1) // 0.286
val overlapBiGram: Double = Overlap.score("karolin", "kathrin", 2) // 0.667
// Smith Waterman Similarities
val smithWaterman: Double = SmithWaterman.score("martha", "marhta", (LinearGap(gapValue = -1), Integer.MAX_VALUE))
val smithWatermanGotoh: Double = SmithWatermanGotoh.score("martha", "marhta", ConstantGap())
// Tversky Similarity
val tversky: Double = Tversky.score("karolin", "kathrin", 0.5) // 0.333
// Phonetic Similarity
val metaphone: Boolean = Metaphone.score("merci", "mercy") // true
val soundex: Boolean = Soundex.score("merci", "mercy") // true
To use implicits that extend the String class, import com.github.vickumar1981.stringdistance.StringConverter._
Example.scala:
// Scala example using implicits
import com.github.vickumar1981.stringdistance.StringConverter._
// Scores between two strings
val cosSimilarity: Double = "hello".cosine("chello")
val damerau: Double = "martha".damerau("marhta")
val diceCoefficient: Double = "martha".diceCoefficient("marhta")
val hamming: Double = "martha".hamming("marhta")
val jaccard: Double = "karolin".jaccard("kathrin")
val jaro: Double = "martha".jaro("marhta")
val jaroWinkler: Double = "martha".jaroWinkler("marhta")
val levenshtein: Double = "martha".levenshtein("marhta")
val needlemanWunsch: Double = "martha".needlemanWunsch("marhta")
val ngramSimilarity: Double = "karolin".nGram("kathrin")
val bigramSimilarity: Double = "karolin".nGram("kathrin", 2)
val overlap: Double = "karolin".overlap("kathrin")
val overlapBiGram: Double = "karolin".overlap("kathrin", 2)
val smithWaterman: Double = "martha".smithWaterman("marhta")
val smithWatermanGotoh: Double = "martha".smithWatermanGotoh("marhta")
val tversky: Double = "karolin".tversky("kathrin", 0.5)
// Distances between two strings
val damerauDist: Int = "martha".damerauDist("marhta") // 1
val hammingDist: Int = "martha".hammingDist("marhta")
val levenshteinDist: Int = "martha".levenshteinDist("marhta")
val longestCommonSeq: Int = "martha".longestCommonSeq("marhta")
val ngramDist: Int = "karolin".nGramDist("kathrin")
val bigramDist: Int = "karolin".nGramDist("kathrin", 2)
// N-Gram tokens, returns a List[String]
val tokens: List[String] = "martha".tokens(2) // List("ma", "ar", "rt", "th", "ha")
// Phonetic similarity of two strings
val metaphone: Boolean = "merci".metaphone("mercy")
val soundex: Boolean = "merci".soundex("mercy")
To use the library from Java, import com.github.vickumar1981.stringdistance.util.StringDistance
Example.java:
// Java example
import com.github.vickumar1981.stringdistance.util.StringDistance;
import com.github.vickumar1981.stringdistance.util.StringSound;
// Scores between two strings
Double cosSimilarity = StringDistance.cosine("hello", "chello");
Double damerau = StringDistance.damerau("martha", "marhta");
Double diceCoefficient = StringDistance.diceCoefficient("martha", "marhta");
Double hamming = StringDistance.hamming("martha", "marhta");
Double jaccard = StringDistance.jaccard("karolin", "kathrin");
Double jaro = StringDistance.jaro("martha", "marhta");
Double jaroWinkler = StringDistance.jaroWinkler("martha", "marhta");
Double levenshtein = StringDistance.levenshtein("martha", "marhta");
Double needlemanWunsch = StringDistance.needlemanWunsch("martha", "marhta");
Double ngramSimilarity = StringDistance.nGram("karolin", "kathrin");
Double bigramSimilarity = StringDistance.nGram("karolin", "kathrin", 2);
Double overlap = StringDistance.overlap("karolin", "kathrin");
Double overlapBiGram = StringDistance.overlap("karolin", "kathrin", 2);
Double smithWaterman = StringDistance.smithWaterman("martha", "marhta");
Double smithWatermanGotoh = StringDistance.smithWatermanGotoh("martha", "marhta");
Double tversky = StringDistance.tversky("karolin", "kathrin", 0.5);
// Distances between two strings
Integer damerauDist = StringDistance.damerauDist("martha", "marhta");
Integer hammingDist = StringDistance.hammingDist("martha", "marhta");
Integer levenshteinDist = StringDistance.levenshteinDist("martha", "marhta");
Integer longestCommonSeq = StringDistance.longestCommonSeq("martha", "marhta");
Integer ngramDist = StringDistance.nGramDist("karolin", "kathrin");
Integer bigramDist = StringDistance.nGramDist("karolin", "kathrin", 2);
// N-Gram tokens, returns a List<String>
List<String> tokens = StringDistance.nGramTokens("martha", 2); // List("ma", "ar", "rt", "th", "ha")
// Phonetic similarity of two strings
Boolean metaphone = StringSound.metaphone("merci", "mercy");
Boolean soundex = StringSound.soundex("merci", "mercy");
You can use the ArrayDistance class just like the StringDistance class, except using a generic array - Array[T] for Scala and T[] for Java.
Make sure your classes are comparable using == for Scala or .equals for Java.
Scala Sample Code:
import com.github.vickumar1981.stringdistance.ArrayDistance._
// Example Levenshtein Distance and Score
val levenshteinDist = Levenshtein.distance(Array("m", "a", "r", "t", "h", "a"), Array("m", "a", "r", "h", "t", "a")) // 2
val levenshtein = Levenshtein.score(Array("m", "a", "r", "t", "h", "a"), Array("m", "a", "r", "h", "t", "a")) // 0.667
Java Example Code:
To add your own algorithm, first create a marker trait that extends StringMetricAlgorithm:
trait CustomAlgorithm extends StringMetricAlgorithm
Then implement either the score or the distance method, depending upon whether the object extends DistanceAlgorithm or ScoringAlgorithm:
implicit object CustomDistance extends DistanceAlgorithm[CustomAlgorithm] {
override def distance(s1: String, s2: String): Int = {
// Implement distance between s1 and s2
}
}
implicit object CustomScore extends ScoringAlgorithm[CustomAlgorithm] {
override def score(s1: String, s2: String): Double = {
// Implement fuzzy score between s1 and s2
}
}
Finally, create an object that extends StringMetric using your algorithm as the type parameter, and use the score and distance methods defined in the implicit object:
object CustomMetric extends StringMetric[CustomAlgorithm]
val customScore: Double = CustomMetric.score("hello", "hello2")
val customDist: Int = CustomMetric.distance("hello", "hello2")
Please report any issues or bugs to the Github issues page.
Please view the contributing guidelines.
This project is licensed under the Apache 2 License.