JuliaText / TextAnalysis.jl

Julia package for text analysis

Performance issue with prepare!(doc, strip_stopwords) #140

Open asbisen opened 5 years ago

asbisen commented 5 years ago

Calling prepare!(::StringDocument, strip_case | strip_stopwords) on even a small ~3.4 MB file takes forever to return (other than on small strings, I have not seen this function finish successfully).

function method_textanalysis(str_rec)
    sdoc = StringDocument(str_rec)
    prepare!(sdoc, strip_case | strip_stopwords)
    return sdoc
end
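For context, prepare! takes a document and a bitmask of preprocessing flags combined with |, and it returns quickly on small inputs. A minimal sketch:

using TextAnalysis

sdoc = StringDocument("This is a small Example")
prepare!(sdoc, strip_case | strip_stopwords)  # fast on small strings
text(sdoc)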

I have tracked the slowness to the following function.

https://github.com/JuliaText/TextAnalysis.jl/blob/c8ae7a217d19f19d8c8e3e22da9ea5970ece40d4/src/preprocessing.jl#L253

function remove_patterns(s::AbstractString, rex::Regex)
    iob = IOBuffer()
    ibegin = 1
    v = codeunits(s)
    for m in eachmatch(rex, s)
        len = m.match.offset - ibegin + 1
        next = nextind(s, lastindex(m.match) + m.match.offset)
        if len > 0
            Base.write_sub(iob, v, ibegin, len)
            if next != length(s) + 1
                write(iob, ' ')
            end
        end
        ibegin = next
    end
    len = length(v) - ibegin + 1
    (len > 0) && Base.write_sub(iob, v, ibegin, len)
    String(take!(iob))
end
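As far as I can tell, strip_stopwords funnels into remove_patterns with one large alternation regex over the entire stopword list. A rough reconstruction of the slow call, for timing purposes (hedged: the exact pattern TextAnalysis builds internally may differ):

using TextAnalysis, Languages

# Hedged reconstruction; TextAnalysis' internal pattern may differ.
stop_words = stopwords(Languages.English())
rex = Regex("\\b(" * join(stop_words, "|") * ")\\b")

str = lowercase(read("2600-0.txt", String))
@time TextAnalysis.remove_patterns(str, rex)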

Manually performing a similar task takes ~1.4 seconds on the same 3.4 MB text file. I say "similar" because, to eliminate stop words manually, I first tokenize the document and then filter out the stop words. That is functionally very different from running a regex over a large string, and may not be the ideal approach for preserving the structure of the document. (Complete code at the end.)

function method_manual(str_rec)
    stop_words = Languages.stopwords(Languages.English())
    str_rec = lowercase(str_rec)
    word_tokens = tokenize(str_rec)
    res = filter(x->!in(x, stop_words), word_tokens)
    return res
end

I was wondering if there could be a more efficient way to perform this elimination of stop words from a StringDocument?
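For what it's worth, one possible direction is a single pass that partitions the string into word and non-word spans and copies every span through except stopwords, which also preserves the original whitespace. A hedged sketch of mine (not how TextAnalysis implements it; assumes the text is already lowercased):

using Languages

# Sketch: split the string into word / non-word spans and write out every
# span that is not a stopword. Set membership is O(1) per word.
function drop_stopwords(s::AbstractString)
    sw = Set(stopwords(Languages.English()))
    iob = IOBuffer()
    for m in eachmatch(r"[a-zA-Z']+|[^a-zA-Z']+", s)
        (m.match in sw) || write(iob, m.match)
    end
    String(take!(iob))
end

The complete code referenced above: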


using Pkg
@info "Installing required packages.."
required_pkg = ["TextAnalysis", "Languages", "WordTokenizers"]
installed_pkg = Pkg.installed()
for p in required_pkg
    in(p, keys(installed_pkg)) || Pkg.add(p)
end

using TextAnalysis
using WordTokenizers
using Languages

"""
Download data if it does not already exist
"""
function download_data(;
    url="http://www.gutenberg.org/files/2600/2600-0.txt", 
    localfile="2600-0.txt")

    if !isfile(localfile)
        download(url, localfile)
    else
        @info "file $(localfile) already exists, skipping download"
    end
end

"""
Return data (~3.4 MB uncompressed) in the form of a string
"""
function getdata(fn="2600-0.txt")
    download_data()
    str_rec = read(fn, String)
end

"""
Preprocess data using TextAnalysis (strip_case | strip_stopwords)
"""
function method_textanalysis(str_rec)
    sdoc = StringDocument(str_rec)
    #prepare!(sdoc, strip_case | strip_stopwords)
    prepare!(sdoc, strip_stopwords)
    return sdoc
end

"""
Preprocess data without using `prepare!`
"""
function method_manual(str_rec)
    stop_words = Languages.stopwords(Languages.English())
    str_rec = lowercase(str_rec)
    word_tokens = tokenize(str_rec)
    res = filter(x->!in(x, stop_words), word_tokens)
    return res
end

"""
Main
"""
function main()
    str_rec = getdata()

    @info "Manual Pre Processing"
    @time res2 = method_manual(str_rec)

    @info "Pre Processing using TextAnalysis"
    @time res1 = method_textanalysis(str_rec)

end

main()
asbisen commented 5 years ago

FYI: a quick test of using replace while looping over stop_words appears to be much faster than the existing remove_patterns method: under ~2 s vs ~940 s.

function preprocess(str_rec)
    stop_words = Languages.stopwords(Languages.English())
    str_rec = lowercase(str_rec)
    for sw in stop_words
        rex = Regex("\\b"*sw*"\\b")
        str_rec = replace(str_rec, rex => "")
    end
    return str_rec
end 
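For a rough comparison on the same file (a usage sketch; timings as reported above):

str = read("2600-0.txt", String)
@time preprocess(str)  # under ~2 s here, vs ~940 s through prepare!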
GdMacmillan commented 5 years ago

Not sure if my input is warranted, but I just wanted to post a solution I found that worked. Note, however, that this removes stop words from the return value of tokenize (using WordTokenizers) rather than from a document.

STOPWORDS = stopwords(Languages.English()); # using Languages

"""
my_tokenize(text, sw)

return iterator for tokenized words in text with stopwords removed by default.
to return only stopwords in text, set argument sw to \'only\'
"""
function my_tokenize(text, sw::String="remove")
    if sw == "remove"
        return collect(word for word in tokenize(text) if !isin(word, STOPWORDS))
    elseif sw == "only"
        return collect(word for word in tokenize(text) if isin(word, STOPWORDS))
    else
        return collect(word for word in tokenize(text))
    end
end

I then apply it like:

purpose = select(t_new, :purpose);
lower = lowercase.(purpose);
num_words = length.(my_tokenize.(lower));

I'm open to hearing improvements, but this was fast and worked for my use case.
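One easy tweak: STOPWORDS is a Vector, so each membership test above is a linear scan; wrapping it in a Set makes lookups O(1). A small sketch (STOPWORD_SET and my_tokenize_set are names I made up):

# Hypothetical variant with O(1) stopword lookups.
const STOPWORD_SET = Set(STOPWORDS)

my_tokenize_set(text) = collect(word for word in tokenize(text) if !(word in STOPWORD_SET))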