Open amielc1 opened 2 months ago
using System;
using BloomFilter;
/// <summary>
/// Appends lines to an output file, skipping every line that the instance-wide
/// Bloom filter reports as already seen.
/// </summary>
/// <remarks>
/// <para>
/// The filter lives for the lifetime of the instance, so de-duplication spans all
/// chunks passed to the same <see cref="ChunkProcessor"/>, not just a single call.
/// </para>
/// <para>
/// A Bloom filter can report false positives: a line that was never written may be
/// reported as present and is then skipped. With the configured error rate this is
/// expected for a small fraction of unique lines, so the output is "at most once",
/// not "exactly once".
/// </para>
/// </remarks>
public class ChunkProcessor : IDisposable
{
    // Sizing of the Bloom filter: expected number of distinct lines and the
    // false-positive probability passed to the factory.
    private const int DefaultCapacity = 1000000;
    private const double DefaultErrorRate = 0.01;

    // Binary semaphore: only one chunk at a time may touch the filter and the file.
    private readonly SemaphoreSlim _semaphore = new SemaphoreSlim(1, 1);
    private readonly IBloomFilter<string> _bloomFilter;
    private bool _disposed;

    /// <summary>
    /// Creates a processor whose Bloom filter is sized with the default capacity
    /// and error rate.
    /// </summary>
    public ChunkProcessor()
    {
        _bloomFilter = BloomFilterFactory.Create<string>(capacity: DefaultCapacity, errorRate: DefaultErrorRate);
    }

    /// <summary>
    /// Appends the not-yet-seen lines of <paramref name="lines"/> to
    /// <paramref name="outputFilePath"/>. Calls on the same instance are serialized.
    /// </summary>
    /// <param name="lines">Lines of the chunk to process.</param>
    /// <param name="outputFilePath">File to append to; it is opened in append mode.</param>
    /// <returns>A task that completes when the chunk has been written and flushed.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="lines"/> or <paramref name="outputFilePath"/> is null.
    /// </exception>
    public async Task ProcessChunkAsync(List<string> lines, string outputFilePath)
    {
        // Validate before taking the semaphore or touching the file system, so a bad
        // call neither blocks other callers nor leaves an empty file behind.
        if (lines == null)
        {
            throw new ArgumentNullException(nameof(lines));
        }

        if (outputFilePath == null)
        {
            throw new ArgumentNullException(nameof(outputFilePath));
        }

        await _semaphore.WaitAsync();
        try
        {
            await WriteUniqueLinesToFileAsync(lines, outputFilePath);
        }
        finally
        {
            _semaphore.Release();
        }
    }

    /// <summary>
    /// Releases the semaphore and, if it is disposable, the Bloom filter.
    /// </summary>
    public void Dispose()
    {
        Dispose(true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Releases the resources owned by this instance.
    /// </summary>
    /// <param name="disposing">
    /// True when called from <see cref="Dispose()"/>; managed resources are only
    /// released in that case.
    /// </param>
    protected virtual void Dispose(bool disposing)
    {
        if (_disposed)
        {
            return;
        }

        if (disposing)
        {
            _semaphore.Dispose();

            // NOTE(review): whether the filter type is disposable is not visible here;
            // dispose it only if it is.
            if (_bloomFilter is IDisposable disposableFilter)
            {
                disposableFilter.Dispose();
            }
        }

        _disposed = true;
    }

    // Writes each line the filter has not seen yet and records it in the filter.
    // Must only be called while holding _semaphore.
    private async Task WriteUniqueLinesToFileAsync(List<string> lines, string outputFilePath)
    {
        using (StreamWriter writer = new StreamWriter(outputFilePath, append: true))
        {
            foreach (var line in lines)
            {
                // A "contains" answer may be a false positive; such a line is skipped.
                if (!_bloomFilter.Contains(line))
                {
                    _bloomFilter.Add(line);
                    await writer.WriteLineAsync(line);
                }
            }

            // Flush asynchronously so that disposing the writer does not have to
            // perform a blocking write of the remaining buffer.
            await writer.FlushAsync();
        }
    }
}
To save memory, use a Bloom filter instead of a HashSet.