fabriziosalmi / blacklists

Hourly updated domains blacklist 🚫
https://github.com/fabriziosalmi/blacklists/releases/download/latest/blacklist.txt
GNU General Public License v3.0
113 stars 5 forks source link

sanitize.py #82

Closed fabriziosalmi closed 9 months ago

fabriziosalmi commented 10 months ago
import re
import tldextract
from tqdm import tqdm

# Pre-compiled regex used for FQDN validation. It matches a single DNS
# label: 1-63 ASCII letters, digits or hyphens, with no hyphen at either end.
FQDN_PATTERN = re.compile(
    r'^(?!-)[A-Za-z0-9-]{1,63}(?<!-)$'
)

def is_valid_fqdn(s):
    """Check if the string is a valid FQDN.

    A name is accepted when it is non-empty, contains no wildcard, is at
    most 253 characters long, has both a domain and a suffix according to
    ``tldextract.extract``, and every dot-separated label matches
    ``FQDN_PATTERN`` in full.

    Args:
        s: Candidate host name (str).

    Returns:
        True if ``s`` passes every check, otherwise False.
    """
    # Empty input and wildcard entries (e.g. "*.example.com") are rejected.
    if not s or '*' in s:
        return False

    # DNS limits a full name to 255 octets on the wire, which corresponds to
    # 253 characters in dotted text form (RFC 1035, section 2.3.4).
    if len(s) > 253:
        return False

    extracted = tldextract.extract(s)
    if not (extracted.domain and extracted.suffix):
        return False

    # fullmatch() rather than match(): the pattern's trailing `$` also
    # matches just before a final newline, so match() would accept a label
    # such as "com\n".
    return all(FQDN_PATTERN.fullmatch(part) for part in s.split('.'))

def remove_prefix(line, *prefixes):
    """Strip the first matching prefix whose remainder is a valid FQDN.

    Prefixes are tried in the order given. For each prefix that ``line``
    starts with, the text after it is whitespace-trimmed and checked with
    ``is_valid_fqdn``; the first remainder that validates is returned.

    Args:
        line: The text to inspect.
        *prefixes: Candidate prefixes, tried in order.

    Returns:
        The trimmed remainder when a prefix matched and the remainder is a
        valid FQDN; otherwise ``line`` unchanged.
    """
    matching = (candidate for candidate in prefixes if line.startswith(candidate))
    for candidate in matching:
        # Trim here so separators such as the blank in "0.0.0.0 host" vanish.
        remainder = line[len(candidate):].strip()
        if is_valid_fqdn(remainder):
            return remainder
    return line

def get_sanitization_rules():
    """Return the ordered list of sanitization rules.

    Each rule is a callable that takes one line (str) and returns either
    the transformed line or None to signal that the line is to be dropped.
    The rules are meant to be applied in list order.

    Normalization (trailing dot, lowercase) runs before prefix removal:
    ``remove_prefix`` only strips a prefix when the remainder validates as
    an FQDN, and a name that still carries a trailing dot does not
    validate. With the old order an entry such as "0.0.0.0 example.com."
    kept its prefix and was later discarded as invalid.

    Returns:
        A list of four callables: comment filter, trailing-dot removal,
        lowercasing, prefix removal.
    """
    def drop_comment(line):
        # Remove comment lines.
        return None if line.startswith("#") else line

    def strip_trailing_dot(line):
        # Remove trailing dot.
        return line.rstrip('.')

    def to_lowercase(line):
        # Convert to lowercase.
        return line.lower()

    def strip_known_prefix(line):
        # Remove specific prefixes (hosts-file addresses, "||", URL schemes).
        return remove_prefix(line, "127.0.0.1", "0.0.0.0", "||", "http://", "https://")

    return [
        drop_comment,
        strip_trailing_dot,
        to_lowercase,
        strip_known_prefix,
    ]

def process_large_file(input_file_path, output_file_path):
    """Sanitize a domain list file and write its unique valid FQDNs, sorted.

    The input is read line by line in two passes: one to count the lines
    for the progress bar, one to process them. Each line goes through the
    sanitization rules and is kept only if the result is a valid FQDN.
    The surviving names are de-duplicated, sorted and written one per line.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Path of the file to write; it is overwritten.
    """
    unique_domains = set()
    rules = get_sanitization_rules()

    # Decode explicitly as UTF-8 instead of the platform locale, and replace
    # undecodable bytes rather than raising: one bad byte would otherwise
    # abort the whole run. A line containing the replacement character cannot
    # pass validation, because labels only admit ASCII letters, digits and
    # hyphens.
    with open(input_file_path, 'r', encoding='utf-8', errors='replace') as infile:
        total_lines = sum(1 for _ in infile)
        infile.seek(0)  # Reset file pointer to the start
        for line in tqdm(infile, total=total_lines, desc="Processing"):
            sanitized_line = sanitize_line(line, rules)
            if sanitized_line and is_valid_fqdn(sanitized_line):
                unique_domains.add(sanitized_line)

    # Write the sorted unique domain names to the output file
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        for domain in tqdm(sorted(unique_domains), desc="Writing"):
            outfile.write(domain + '\n')

def sanitize_line(line, rules):
    """Run a line through every rule in order and return the result.

    Before each rule is called, the current value is stripped of
    surrounding whitespace. A rule returning None stops processing.

    Args:
        line: The raw input line.
        rules: Iterable of callables taking a str and returning a str or None.

    Returns:
        The output of the last rule, or None as soon as any rule returns
        None. With no rules, ``line`` is returned untouched.
    """
    current = line
    for apply_rule in rules:
        current = apply_rule(current.strip())
        if current is None:
            break
    return current

# Use this function to process your large file. The call is guarded so that
# it runs only when the file is executed as a script; importing the module
# no longer reads or writes any file.
if __name__ == "__main__":
    process_large_file('input.txt', 'output.txt')