amielc1 / LearnAsync

0 stars 0 forks source link

Use a Bloom filter instead of a HashSet #2

Open amielc1 opened 2 months ago

amielc1 commented 2 months ago

To save memory, use a Bloom filter instead of a HashSet.

amielc1 commented 2 months ago
using BloomFilter;

/// <summary>
/// Appends chunks of lines to an output file, skipping every line that the
/// shared Bloom filter already reports as present.
/// </summary>
/// <remarks>
/// A single-slot <see cref="SemaphoreSlim"/> ensures that only one chunk is
/// processed at a time per instance, so the filter and the output file are
/// never touched by two chunks concurrently.
/// NOTE(review): the filter is created with a non-zero error rate, so it
/// presumably can report a never-seen line as present, in which case that line
/// is silently dropped from the output. Confirm against the BloomFilter
/// library and decide whether that loss is acceptable.
/// </remarks>
public class ChunkProcessor
{
    // Sizing arguments handed to the Bloom filter factory.
    private const int ExpectedItemCount = 1000000;
    private const double FalsePositiveRate = 0.01;

    private readonly SemaphoreSlim _semaphore = new SemaphoreSlim(1, 1);
    private readonly IBloomFilter<string> _bloomFilter;

    /// <summary>
    /// Creates a processor whose Bloom filter is sized with
    /// <see cref="ExpectedItemCount"/> and <see cref="FalsePositiveRate"/>.
    /// </summary>
    public ChunkProcessor()
    {
        _bloomFilter = BloomFilterFactory.Create<string>(capacity: ExpectedItemCount, errorRate: FalsePositiveRate);
    }

    /// <summary>
    /// Appends the not-yet-seen lines of <paramref name="lines"/> to the file at
    /// <paramref name="outputFilePath"/>, one chunk at a time.
    /// </summary>
    /// <param name="lines">The lines of the chunk to process.</param>
    /// <param name="outputFilePath">Path of the file the lines are appended to.</param>
    /// <returns>A task that completes once the chunk has been written.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="lines"/> is <c>null</c>.
    /// </exception>
    public async Task ProcessChunkAsync(List<string> lines, string outputFilePath)
    {
        // Fail fast, before taking the semaphore or touching the file system;
        // otherwise a null chunk would only fail inside the write loop, after
        // the output file had already been opened (and possibly created).
        if (lines == null)
        {
            throw new ArgumentNullException(nameof(lines));
        }

        await _semaphore.WaitAsync();

        try
        {
            await WriteUniqueLinesToFile(lines, outputFilePath);
        }
        finally
        {
            // Always hand the slot back, even when writing failed.
            _semaphore.Release();
        }
    }

    /// <summary>
    /// Opens the output file in append mode and writes every line the Bloom
    /// filter does not already contain, registering each written line in the filter.
    /// </summary>
    private async Task WriteUniqueLinesToFile(List<string> lines, string outputFilePath)
    {
        using (StreamWriter writer = new StreamWriter(outputFilePath, append: true))
        {
            foreach (var line in lines)
            {
                // Lines the filter reports as present are skipped.
                if (!_bloomFilter.Contains(line))
                {
                    _bloomFilter.Add(line);
                    await writer.WriteLineAsync(line);
                }
            }

            // Flush asynchronously here so the synchronous Dispose at the end
            // of the using block has no buffered data left to write and does
            // not block the calling thread on file I/O.
            await writer.FlushAsync();
        }
    }
}