KevinStern / software-and-algorithms

Neat algorithm implementations in Java.
MIT License
118 stars 70 forks source link

I have implemented this in C++ but I get distances incremented by 1 #2

Closed brokenthorn closed 11 years ago

brokenthorn commented 11 years ago

My following implementation which must be called like _calculate(target, source) even though the signature was copied from your Java code, produces results incremented by 1 compared to, say this.

#include "dameraulevenshtein.h"
#include <boost/numeric/ublas/matrix.hpp>
#include <QMap>

DamerauLevenshtein::DamerauLevenshtein(int deleteCost, int insertCost, int replaceCost, int swapCost, QObject *parent) :
    QObject(parent), deleteCost(deleteCost), insertCost(insertCost), replaceCost(replaceCost), swapCost(swapCost)
{
}

unsigned int DamerauLevenshtein::calculate(QString &source, QString &target)
{
    return this->_calculate(source, target);
}

/**
 * @brief Compute the Damerau-Levenshtein distance between the specified source string and the specified target string.
 * @param Source string.
 * @param Target string.
 * @return Damerau-Levenshtein distance as unsigned int.
 */
unsigned int DamerauLevenshtein::_calculate(QString &source, QString &target)
{
    /*
     * Required to facilitate the premise to the algorithm that two swaps of
     * the same character are never required for optimality.
     */
    if (2 * swapCost < insertCost + deleteCost)
        return -1;

    using namespace boost::numeric::ublas;

    int slen = source.length();
    int tlen = target.length();
    matrix<int>* table = new matrix<int>(slen, tlen);

    if (source.at(0) != target.at(0))
        table->insert_element(0, 0, std::min(replaceCost, deleteCost + insertCost));

    QMap<QChar, int> sourceIndexByCharacterQMap;
    sourceIndexByCharacterQMap.insert(source.at(0), 0);

    for (int i = 1; i < source.length(); i++)
    {
        int deleteDistance = table->at_element(i - 1, 0) = deleteCost;
        int insertDistance = (i + 1) * deleteCost + insertCost;
        int matchDistance = i * deleteCost
                + (source.at(i) == target.at(0) ? 0 : replaceCost);
        table->at_element(i, 0) = std::min(std::min(deleteDistance, insertDistance), matchDistance);
    }

    for (int j = 1; j < tlen; j++)
    {
        int deleteDistance = table->at_element(0, j - 1) + insertCost;
        int insertDistance = (j + 1) * insertCost + deleteCost;
        int matchDistance = j * insertCost
                + (source.at(0) == target.at(j) ? 0 : replaceCost);
        table->at_element(0, j) = std::min(std::min(deleteDistance, insertDistance),
                               matchDistance);
    }

    for (int i = 1; i < slen; i++) {
        int maxSourceLetterMatchIndex = source.at(i) == target.at(0) ? 0 : -1;
        for (int j = 1; j < tlen; j++) {
            bool foundCandidateSwapIndex = false;
            int candidateSwapIndex = 0;
            if (sourceIndexByCharacterQMap.contains(target.at(j)))
            {
                foundCandidateSwapIndex = true;
                candidateSwapIndex = sourceIndexByCharacterQMap[target.at(j)];
            }
            int jSwap = maxSourceLetterMatchIndex;
            int deleteDistance = table->at_element(i - 1, j) + deleteCost;
            int insertDistance = table->at_element(i, j - 1) + insertCost;
            int matchDistance = table->at_element(i - 1, j - 1);
            if (source.at(i) != target.at(j)) {
                matchDistance += replaceCost;
            } else {
                maxSourceLetterMatchIndex = j;
            }
            int swapDistance;
            if (foundCandidateSwapIndex && jSwap != -1) {
                int iSwap = candidateSwapIndex;
                int preSwapCost;
                if (iSwap == 0 && jSwap == 0) {
                    preSwapCost = 0;
                } else {
                    preSwapCost = table->at_element(std::max(0, iSwap - 1), std::max(0, jSwap - 1));
                }
                swapDistance = preSwapCost + (i - iSwap - 1) * deleteCost
                        + (j - jSwap - 1) * insertCost + swapCost;
            } else {
                swapDistance = INT_MAX;
            }
            table->at_element(i, j) = std::min(
                        std::min(
                            std::min(
                                deleteDistance,
                                insertDistance),
                            matchDistance),
                        swapDistance);
        }
        sourceIndexByCharacterQMap.insert(source.at(i), i);
    }

    int result = table->at_element(slen - 1, tlen - 1);
    return result;
}
KevinStern commented 11 years ago

I've ported this to C++ myself, here: https://github.com/KevinStern/software-and-algorithms-cpp/blob/master/damerau_levenshtein.h

Do you notice the same problem with my port? If so, could you provide a specific string pair that gives a delta with the website that you specified?

On Sat, Mar 16, 2013 at 3:17 PM, brokenthorn notifications@github.comwrote:

My following implementation which must be called like _calculate(target, source) even though the signature was copied from your Java code, produces results incremented by 1 compared to, say thishttp://fuzzy-string.com/Compare/ .

include "dameraulevenshtein.h"#include <boost/numeric/ublas/matrix.hpp>#include

DamerauLevenshtein::DamerauLevenshtein(int deleteCost, int insertCost, int replaceCost, int swapCost, QObject _parent) : QObject(parent), deleteCost(deleteCost), insertCost(insertCost), replaceCost(replaceCost), swapCost(swapCost){} unsigned int DamerauLevenshtein::calculate(QString &source, QString &target){ return this->calculate(source, target);} /* * @brief Compute the Damerau-Levenshtein distance between the specified source string and the specified target string. * @param Source string. * @param Target string. * @return Damerau-Levenshtein distance as unsigned int. _/unsigned int DamerauLevenshtein::calculate(QString &source, QString &target){ / * Required to facilitate the premise to the algorithm that two swaps of * the same character are never required for optimality. / if (2 \ swapCost < insertCost + deleteCost) return -1;

using namespace boost::numeric::ublas;

int slen = source.length();
int tlen = target.length();
matrix<int>* table = new matrix<int>(slen, tlen);

if (source.at(0) != target.at(0))
    table->insert_element(0, 0, std::min(replaceCost, deleteCost + insertCost));

QMap<QChar, int> sourceIndexByCharacterQMap;
sourceIndexByCharacterQMap.insert(source.at(0), 0);

for (int i = 1; i < source.length(); i++)
{
    int deleteDistance = table->at_element(i - 1, 0) = deleteCost;
    int insertDistance = (i + 1) * deleteCost + insertCost;
    int matchDistance = i * deleteCost
            + (source.at(i) == target.at(0) ? 0 : replaceCost);
    table->at_element(i, 0) = std::min(std::min(deleteDistance, insertDistance), matchDistance);
}

for (int j = 1; j < tlen; j++)
{
    int deleteDistance = table->at_element(0, j - 1) + insertCost;
    int insertDistance = (j + 1) * insertCost + deleteCost;
    int matchDistance = j * insertCost
            + (source.at(0) == target.at(j) ? 0 : replaceCost);
    table->at_element(0, j) = std::min(std::min(deleteDistance, insertDistance),
                           matchDistance);
}

for (int i = 1; i < slen; i++) {
    int maxSourceLetterMatchIndex = source.at(i) == target.at(0) ? 0 : -1;
    for (int j = 1; j < tlen; j++) {
        bool foundCandidateSwapIndex = false;
        int candidateSwapIndex = 0;
        if (sourceIndexByCharacterQMap.contains(target.at(j)))
        {
            foundCandidateSwapIndex = true;
            candidateSwapIndex = sourceIndexByCharacterQMap[target.at(j)];
        }
        int jSwap = maxSourceLetterMatchIndex;
        int deleteDistance = table->at_element(i - 1, j) + deleteCost;
        int insertDistance = table->at_element(i, j - 1) + insertCost;
        int matchDistance = table->at_element(i - 1, j - 1);
        if (source.at(i) != target.at(j)) {
            matchDistance += replaceCost;
        } else {
            maxSourceLetterMatchIndex = j;
        }
        int swapDistance;
        if (foundCandidateSwapIndex && jSwap != -1) {
            int iSwap = candidateSwapIndex;
            int preSwapCost;
            if (iSwap == 0 && jSwap == 0) {
                preSwapCost = 0;
            } else {
                preSwapCost = table->at_element(std::max(0, iSwap - 1), std::max(0, jSwap - 1));
            }
            swapDistance = preSwapCost + (i - iSwap - 1) * deleteCost
                    + (j - jSwap - 1) * insertCost + swapCost;
        } else {
            swapDistance = INT_MAX;
        }
        table->at_element(i, j) = std::min(
                    std::min(
                        std::min(
                            deleteDistance,
                            insertDistance),
                        matchDistance),
                    swapDistance);
    }
    sourceIndexByCharacterQMap.insert(source.at(i), i);
}

int result = table->at_element(slen - 1, tlen - 1);
return result;}

— Reply to this email directly or view it on GitHubhttps://github.com/KevinStern/software-and-algorithms/issues/2 .

brokenthorn commented 11 years ago

I don't find the problem anymore with this implementation but I have already ported another Java implementation (LingPipe) that worked the first time. I might compare this and that to see which is better/faster.

On Wed, Mar 20, 2013 at 12:13 PM, KevinStern notifications@github.comwrote:

I've ported this to C++ myself, here:

https://github.com/KevinStern/software-and-algorithms-cpp/blob/master/damerau_levenshtein.h

Do you notice the same problem with my port? If not, could you provide a specific string pair that gives a delta with the website that you specified?

On Sat, Mar 16, 2013 at 3:17 PM, brokenthorn notifications@github.comwrote:

My following implementation which must be called like _calculate(target, source) even though the signature was copied from your Java code, produces results incremented by 1 compared to, say this< http://fuzzy-string.com/Compare/> .

include "dameraulevenshtein.h"#include

<boost/numeric/ublas/matrix.hpp>#include DamerauLevenshtein::DamerauLevenshtein(int deleteCost, int insertCost, int replaceCost, int swapCost, QObject _parent) : QObject(parent), deleteCost(deleteCost), insertCost(insertCost), replaceCost(replaceCost), swapCost(swapCost){} unsigned int DamerauLevenshtein::calculate(QString &source, QString &target){ return this->calculate(source, target);} /* * @brief Compute the Damerau-Levenshtein distance between the specified source string and the specified target string. * @param Source string. * @param Target string. * @return Damerau-Levenshtein distance as unsigned int. _/unsigned int DamerauLevenshtein::calculate(QString &source, QString &target){ / * Required to facilitate the premise to the algorithm that two swaps of * the same character are never required for optimality. / if (2 \ swapCost < insertCost + deleteCost) return -1;

using namespace boost::numeric::ublas;

int slen = source.length(); int tlen = target.length(); matrix* table = new matrix(slen, tlen);

if (source.at(0) != target.at(0)) table->insert_element(0, 0, std::min(replaceCost, deleteCost + insertCost));

QMap<QChar, int> sourceIndexByCharacterQMap; sourceIndexByCharacterQMap.insert(source.at(0), 0);

for (int i = 1; i < source.length(); i++) { int deleteDistance = table->at_element(i - 1, 0) = deleteCost; int insertDistance = (i + 1) * deleteCost + insertCost; int matchDistance = i * deleteCost

  • (source.at(i) == target.at(0) ? 0 : replaceCost); table->at_element(i, 0) = std::min(std::min(deleteDistance, insertDistance), matchDistance); }

for (int j = 1; j < tlen; j++) { int deleteDistance = table->at_element(0, j - 1) + insertCost; int insertDistance = (j + 1) * insertCost + deleteCost; int matchDistance = j * insertCost

  • (source.at(0) == target.at(j) ? 0 : replaceCost); table->at_element(0, j) = std::min(std::min(deleteDistance, insertDistance), matchDistance); }

for (int i = 1; i < slen; i++) { int maxSourceLetterMatchIndex = source.at(i) == target.at(0) ? 0 : -1; for (int j = 1; j < tlen; j++) { bool foundCandidateSwapIndex = false; int candidateSwapIndex = 0; if (sourceIndexByCharacterQMap.contains(target.at(j))) { foundCandidateSwapIndex = true; candidateSwapIndex = sourceIndexByCharacterQMap[target.at(j)]; } int jSwap = maxSourceLetterMatchIndex; int deleteDistance = table->at_element(i - 1, j) + deleteCost; int insertDistance = table->at_element(i, j - 1) + insertCost; int matchDistance = table->at_element(i - 1, j - 1); if (source.at(i) != target.at(j)) { matchDistance += replaceCost; } else { maxSourceLetterMatchIndex = j; } int swapDistance; if (foundCandidateSwapIndex && jSwap != -1) { int iSwap = candidateSwapIndex; int preSwapCost; if (iSwap == 0 && jSwap == 0) { preSwapCost = 0; } else { preSwapCost = table->at_element(std::max(0, iSwap - 1), std::max(0, jSwap - 1)); } swapDistance = preSwapCost + (i - iSwap - 1) * deleteCost

  • (j - jSwap - 1) * insertCost + swapCost; } else { swapDistance = INT_MAX; } table->at_element(i, j) = std::min( std::min( std::min( deleteDistance, insertDistance), matchDistance), swapDistance); } sourceIndexByCharacterQMap.insert(source.at(i), i); }

int result = table->at_element(slen - 1, tlen - 1); return result;}

— Reply to this email directly or view it on GitHub< https://github.com/KevinStern/software-and-algorithms/issues/2> .

— Reply to this email directly or view it on GitHubhttps://github.com/KevinStern/software-and-algorithms/issues/2#issuecomment-15167003 .

Manole Paul-Sebastian Blog: obsidianlake.blogspot.com Twitter: @brokenthorn

brokenthorn commented 11 years ago

P. Sebastian Manole sent you an invitation

Twitter helps you stay connected with what's happening right now and with the people and organizations you care about.

Accept invitation

https://twitter.com/i/de2e0a4b-5304-45d0-a9be-26a4026ebed3

This message was sent by Twitter on behalf of Twitter users who entered your email address to invite you to Twitter. Unsubscribe: https://twitter.com/i/o?t=1&iid=6b877a74-8cdf-4bd7-8976-be501aa6cde5&uid=0&c=IuVwbCB3dgFHCKEuuXJ0sXJwbtKFpdXl77BFKYYDeyhVThUhXvh3KHKX3Ga%2BARYcf7snTJDkX3BnizrQLkX1ayv8fgZwFeYnoV3M%2FXCXzEQr%2BQSKP29UOg%3D%3D&nid=9+26

Need help? https://support.twitter.com