google / cld3

Apache License 2.0
776 stars 109 forks source link

"https" causes French text to be identified as English #19

Open ntedgi opened 5 years ago

ntedgi commented 5 years ago

I used a simple JavaCPP adapter: https://github.com/bytedeco/javacpp

import org.bytedeco.javacpp.Loader;
import org.bytedeco.javacpp.Pointer;
import org.bytedeco.javacpp.annotation.Platform;

/**
 * JavaCPP-backed wrapper around the native language detector declared in
 * {@code LangDetect.h} and linked from the {@code cLangDetect} library.
 *
 * <p>Constructing an instance calls the native {@code allocate()}; detection
 * results are written into a caller-supplied {@link LangData}.
 */
@Platform(include = {"LangDetect.h"}, link = {"cLangDetect"})
public class LangDetect extends Pointer {
    static {
        // Load the native library when the class is initialized, before any
        // native method can be invoked.
        Loader.load();
    }

    /** Creates a detector by allocating its native counterpart. */
    public LangDetect() {
        allocate();
    }

    /**
     * Runs detection on {@code str} and stores the outcome in {@code result}.
     *
     * @param str    text to analyse
     * @param result receives the detection output from the native call
     */
    public void detect(String str, LangData result) {
        // NOTE(review): String.length() counts UTF-16 code units. If the native
        // side expects a byte length, this is too small for non-ASCII text —
        // confirm against LangDetect.h.
        final int length = str.length();
        detect(str, length, result);
    }

    /** Native allocation hook invoked from the constructor. */
    private native void allocate();

    /** Native detection entry point: text, its length, and the output holder. */
    private native void detect(String text, int length, LangData result);
}

Test with "https" inside the URLs:

/**
 * Reproduction case: runs [LangDetect] on a tweet whose two trailing URLs
 * keep their "https" scheme, then prints the name and score stored at
 * index 0 of the resulting [LangData].
 */
object CLD2Example {
    @JvmStatic
    fun main(args: Array<String>) {
        // Input under test: French text followed by two URLs with the scheme intact.
        val tweet = "Sampension,3ème caisse de retraite danoise\uD83C\uDDE9\uD83C\uDDF0,#BoycottIsrael,boycott 4 firmes liées à des colonies!\n" +
            "#GroupPalestine\n" +
            "⏩https://t.co/gPIEbpotvk https://t.co/P4TaPvcBdX "

        val detector = LangDetect()
        val outcome = LangData()
        detector.detect(tweet, outcome)

        // Print the entry at index 0: its name first, then its score.
        with(outcome) {
            println(getName(0))
            println(getScore(0))
        }
    }
}

output: ENGLISH 0.99

Test without "https" inside the URLs:

/**
 * Reproduction case: runs [LangDetect] on a tweet whose two trailing URLs
 * have had the "https" scheme removed (leaving "://..."), then prints the
 * name and score stored at index 0 of the resulting [LangData].
 */
object CLD2Example {
    @JvmStatic
    fun main(args: Array<String>) {
        // Input under test: French text followed by two URLs with the scheme stripped.
        val tweet = "Sampension,3ème caisse de retraite danoise\uD83C\uDDE9\uD83C\uDDF0,#BoycottIsrael,boycott 4 firmes liées à des colonies!\n" +
            "#GroupPalestine\n" +
            "⏩://t.co/gPIEbpotvk ://t.co/P4TaPvcBdX "

        val detector = LangDetect()
        val outcome = LangData()
        detector.detect(tweet, outcome)

        // Print the entry at index 0: its name first, then its score.
        with(outcome) {
            println(getName(0))
            println(getScore(0))
        }
    }
}

output: FRENCH 0.99

bsolomon1124 commented 4 years ago

Unlike cld2, I don't believe that cld3 removes http[s] by default.