apache / lucenenet

Apache Lucene.NET
https://lucenenet.apache.org/
Apache License 2.0
2.24k stars 638 forks source link

Lucene.Net: 4.8 SetNextReader executes repeatedly and returns only one result #915

Closed mowali closed 1 week ago

mowali commented 8 months ago

Is there an existing issue for this?

Task description

I updated from Lucene from 3.0 to 4.8, I amended my FieldComparer code. I use a comparator to get results back and I get the results via the call to SetNextReader. I only have 1 indexreader to read from:

public override void SetNextReader(IndexReader reader, int docBase)
{
    currentReaderValues = FieldCache_Fields.DEFAULT.GetStrings(reader, field);
}

public override FieldComparer SetNextReader(AtomicReaderContext context)
{
    sortedResults = FieldCache.DEFAULT.GetTermsIndex(context.AtomicReader, field);
    return this;
}

the older verison SetNextReader is executed once correctly and returning 100 results of while the new SetNextReader executes multiple time and returning only one result. Attached is my old and new code. The searcher is setup like this:

var searchFields = new search[4];
searchFields[0] = "Id";
searchFields[1] = "FirstName";
searchFields[2] = "LastName";
searchFields[3] = "Tag";

var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);
var parser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, searchFields, analyzer)
{
    AllowLeadingWildcard = true,
    DefaultOperator = QueryParserBase.AND_OPERATOR
};

var searchCriteria = "Jones"
var query = parser.Parse(searchCriteria);

var directory = FSDirectory.Open(new DirectoryInfo(indexDirectory));
DirectoryReader di = DirectoryReader.Open(directory);
var searcher = new IndexSearcher(di);   

var filter = new QueryWrapperFilter(query);
var topDocs = searcher.Search(query, filter, (1 * 25), sort);

// OLD COMPARATOR CODE WITH 3.0
internal class TagComparator : FieldComparator
{
    private string[] values;
    private string[] currentReaderValues;
    private string field;
    private string bottom; 
    private bool reversed;

    public TagComparator(int numHits, string field, bool reversed)
    {
        values = new string[numHits];
        this.field = field;
        this.reversed = reversed;
    }

    public override int Compare(int slot1, int slot2)
    {
        string v1 = values[slot1];
        string v2 = values[slot2];

        return DoCompare(v1, v2);
    }

    public override int CompareBottom(int doc)
    {
        string v2 = currentReaderValues[doc];

        return DoCompare(bottom, v2);
    }

    private int DoCompare(string v1, string v2)
    {
        if (string.IsNullOrEmpty(v1))
        {
            if (string.IsNullOrEmpty(v2))
            {
                return 0;
            }

            return reversed ? -1 : 1;
        }

        if (string.IsNullOrEmpty(v2))
        {
            return reversed ? 1 : -1;
        }

        return v1.CompareTo(v2);
    }

    public override void Copy(int slot, int doc)
    {
        values[slot] = currentReaderValues[doc];
    }

    public override void SetNextReader(IndexReader reader, int docBase)
    {
        currentReaderValues = FieldCache_Fields.DEFAULT.GetStrings(reader, field);
    }

    public override void SetBottom(int bottom)
    {
        this.bottom = values[bottom];
    }

    public override IComparable this[int slot] => values[slot];
}

//NEW COMPARER CODE WITH 4.8 **
internal class TagComparator : FieldComparer<BytesRef>
{
    protected readonly ILog Log;
    private readonly BytesRef[] bvalues;
    private readonly string field;
    private readonly bool reversed;
    private BytesRef termCopy = new BytesRef();
    private SortedDocValues sortedResults;
    private int bottomSlot;

    public override BytesRef this[int slot] => bvalues[slot];

    public TagComparator(int numHits, string field, bool reversed)
    {
        bvalues = new BytesRef[numHits];
        this.field = field;
        this.reversed = reversed;
    }

    public override int Compare(int slot1, int slot2)
    {
        var result1 = DoCompare(bvalues[slot1], bvalues[slot2]);
        return result1;
    }

    private int DoCompare(BytesRef v1, BytesRef v2)
    {
        if (v1.Length == 0)
        {
            if (v2.Length == 0)
            {
                return 0;
            }

            return reversed ? -1 : 1;
        }

        if (v2.Length == 0)
        {
            return reversed ? 1 : -1;
        }

        if (v1.CompareTo(v2) > 0)
            return 1;
        else
            return -1;
    }

    public override void Copy(int slot, int doc)
    {
        termCopy = new BytesRef();
        sortedResults.Get(doc, termCopy);
        bvalues[slot] = termCopy;
    }

    public override int CompareBottom(int doc)
    {
        BytesRef termOrd = new BytesRef();
        int ord = sortedResults.GetOrd(doc);
        sortedResults.LookupOrd(ord, termOrd);
        var result = DoCompare(bvalues[bottomSlot], termOrd);

        return result;
    }

    public override void SetBottom(int bottom)
    {
        bottomSlot = bottom;
    }

    public override FieldComparer SetNextReader(AtomicReaderContext context)
    {
        sortedResults = FieldCache.DEFAULT.GetTermsIndex(context.AtomicReader, field);
        return this;
    }

    public override int CompareTop(int doc)
    {
        throw new NotImplementedException();
    }

    public override void SetTopValue(BytesRef value)
    {
        throw new NotImplementedException();
    }
}
NightOwl888 commented 8 months ago

Maybe I am missing something, but why aren't you using the built-in string comparer (TermOrdValComparer) with reverse support?

https://github.com/apache/lucenenet/blob/bed207a81acb7d100a5545dc28eeeb66d35e06ba/src/Lucene.Net.Tests/Search/TestSort.cs#L350-L384

https://github.com/apache/lucenenet/blob/bed207a81acb7d100a5545dc28eeeb66d35e06ba/src/Lucene.Net.Tests/Search/TestSort.cs#L279-L312

If the custom FieldComparer is required, your main issue is that you are allocating memory inside of the comparison methods. Comparers should only allocate memory in the class constructor and/or field initializers, never in the comparison methods, because this will cause a lot of garbage collection overhead.

    public override void Copy(int slot, int doc)
    {
        termCopy = new BytesRef(); // Expensive allocation. This method will be called a lot.
        sortedResults.Get(doc, termCopy);
        bvalues[slot] = termCopy;
    }

    public override int CompareBottom(int doc)
    {
        BytesRef termOrd = new BytesRef(); // Expensive allocation. This method will be called a lot.
        int ord = sortedResults.GetOrd(doc);
        sortedResults.LookupOrd(ord, termOrd); // Expensive lookup. The TermOrdValComparer only does lookups in the Copy() method and only when necessary.
        var result = DoCompare(bvalues[bottomSlot], termOrd);

        return result;
    }

Also, the TermOrdValComparer uses ords to do the comparison in most cases, falling back to term comparison only when necessary. See the TermOrdValComparer docs and FieldComparer<T> docs.

paulirwin commented 1 week ago

See response from @NightOwl888 above. Feel free to reopen if you feel like there is more to discuss here. Thanks!