mnestorov / file-classifier

The File Classifier is a Python script that monitors a specified directory (typically the Downloads folder) and automatically organizes files into different folders based on their file types.
MIT License
3 stars 2 forks source link

Not an issue, more of a suggestion #1

Open fabiendostie opened 5 months ago

fabiendostie commented 5 months ago

I find the list of extensions quite limited. I have put together a much more extensive list; I don't know if you'd like to implement it in the code. I will in my fork. Here it is:

# Proposed extension lists, keyed by destination category name.
# Every extension is written lower-case with its leading dot.
# NOTE(review): some extensions appear under more than one category
# (".raw", ".eps", ".ai", ".awb", ".nsf", ".jar", ".arj", ".asc", ".php",
# ".xlsx", ".bin", ".svg"), and ".ora" is listed twice under "Databases".
file_extensions = {
    "Audio": [
        ".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".aiff", ".au", ".m4a", ".mp2",
        ".mpc", ".opus", ".ra", ".raw", ".vox", ".wv", ".webm", ".8svx", ".cda", ".mid",
        ".midi", ".mod", ".mpa", ".oga", ".s3m", ".spx", ".3ga", ".aax", ".ac3", ".amr",
        ".ape", ".asf", ".ast", ".awb", ".dts", ".flp", ".gsm", ".iklax", ".ivs", ".m3u",
        ".m4b", ".m4r", ".mmf", ".msv", ".nmf", ".nsf", ".nwc", ".pcm", ".qcp", ".tta"
    ],
    "Video": [
        ".mp4", ".avi", ".flv", ".mov", ".wmv", ".mkv", ".m4v", ".mpg", ".mpeg", ".vob",
        ".rmvb", ".ogv", ".3gp", ".3g2", ".drc", ".gifv", ".m2v", ".ogx", ".svi", ".yuv",
        ".264", ".3gpp", ".asx", ".bik", ".braw", ".divx", ".dv", ".evo", ".f4v", ".flc",
        ".h264", ".hevc", ".m2ts", ".m8", ".mnv", ".mts", ".nsv", ".nuv", ".pva", ".r3d",
        ".rl2", ".roq"
    ],
    "Images": [
        ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".psd", ".eps", ".ai", ".indd",
        ".raw", ".cr2", ".nef", ".orf", ".sr2", ".dwg", ".dxf", ".heif", ".ico",
        ".jng", ".jxl", ".pbm", ".pcx", ".pict", ".apng", ".avif", ".bpg", ".cgm",
        ".cmx", ".dib", ".djv", ".flif", ".hdr", ".hrz", ".ilbm", ".lbm", ".miff",
        ".niff", ".nol", ".pam", ".pcd", ".pgm", ".ppm", ".ras", ".sgi", ".tga", ".wbmp",
        ".xpm"
    ],
    "Vectorial Images": [
        ".svg", ".cdr", ".eps", ".ai", ".afdesign", ".avit", ".e2d", ".fig", ".sk", ".sk1", ".sxd", ".v2d",
        ".vml", ".wmf", ".xar", ".xcf", ".vsd", ".ppt", ".odg", ".svgz", ".drw", ".emf",
        ".gt2", ".hpgl", ".iges", ".mgcb", ".plt", ".rdp", ".sda", ".sdr", ".stl", ".svf",
        ".swf", ".tikz", ".wmz", ".xaml", ".xd", ".xmind", ".3dv", ".amf", ".art", ".asc",
        ".bvh"
    ],
    "Installers": [
        ".exe", ".msi", ".dmg", ".pkg", ".deb", ".rpm", ".appimage", ".run", ".bat", ".cmd",
        ".bin", ".app", ".gadget", ".jar", ".wsf", ".aam", ".air", ".appx", ".awb", ".crx",
        ".ipk", ".isu", ".job", ".jse", ".tar.gz", ".tgz", ".bz2", ".lz", ".lzma", ".lzo",
        ".xz", ".z", ".7zip", ".ace", ".afa", ".alz", ".arc", ".arj", ".bz", ".cabinet",
        ".cpio", ".dar", ".dd", ".ear", ".gca", ".ha"
    ],
    "Documents": [
        ".doc", ".docx", ".pdf", ".txt", ".odt", ".rtf", ".xls", ".xlsx", ".pptx",
        ".odp", ".ods", ".md", ".epub", ".djvu", ".mobi",
        ".azw", ".azw3", ".fb2", ".ibooks", ".cbr", ".cbz", ".abw", ".ans", ".asc", ".aww",
        ".ccf", ".chm", ".clkw", ".docm", ".dot", ".dotx", ".egnt", ".fdx", ".ftm", ".ftx",
        ".gdoc", ".hwp", ".hwpml", ".log", ".lwp", ".mbp", ".me", ".nbp", ".neis", ".nq"
    ],
    "Archives": [
        ".zip", ".rar", ".7z", ".tar", ".gz", ".apk", ".arj",
        ".cab", ".iso", ".jar", ".part", ".pea", ".s7z", ".sit",
        ".sitx", ".zipx", ".zoo", ".war", ".cdx", ".cso",
        ".dgc", ".hki", ".ice", ".j", ".lha", ".lzh",
        ".lzx", ".pak", ".rar5", ".rk", ".sen", ".sfx", ".shar", ".sqx", ".uue", ".warc"
    ],
    "Programming": [
        ".py", ".js", ".java", ".c", ".cpp", ".cs", ".php", ".rb", ".swift", ".go", ".ts",
        ".pl", ".lua", ".groovy", ".scala", ".rs", ".kt", ".m", ".dart", ".pas", ".asm",
        ".vbs", ".s", ".h", ".hpp", ".ada", ".adb", ".ads", ".agda", ".asmx", ".awk",
        ".bash", ".bsh", ".cls", ".cob", ".coffee", ".cppm", ".csx", ".cu", ".cuh",
        ".d", ".erl", ".f", ".f90", ".f95", ".fs", ".gml", ".hcl", ".hs"
    ],
    "Web": [
        ".html", ".css", ".php", ".asp", ".jsp", ".aspx", ".cgi", ".xml", ".ajax",
        ".cfm", ".html5", ".xhtml", ".rss", ".atom", ".scss", ".less", ".sass", ".wasm", ".vue",
        ".svelte", ".erb", ".haml", ".handlebars", ".hbs", ".jspf", ".liquid", ".mustache",
        ".phtml", ".rhtml", ".slim", ".tmpl", ".twig", ".volt", ".xht", ".xsl", ".yaml", ".yml",
        ".do", ".jhtm", ".jspx", ".jst", ".lda", ".rjs", ".tld"
    ],
    "Databases": [
        ".db", ".mdb", ".accdb", ".sqlite", ".dbf", ".mdf", ".ora", ".fdb", ".db2",
        ".ib", ".myd", ".myi", ".frm", ".odb", ".pdb", ".sqlitedb", ".sqlite3", ".dat", ".db3",
        ".sdb", ".s3db", ".dbk", ".dbx", ".dcb", ".fmp", ".fp5", ".fp7", ".gdb", ".kdb", ".accde",
        ".adp", ".daf", ".edb", ".fmp12", ".frx", ".itdb", ".mdbhtml", ".ndf", ".nsf", ".nv2",
        ".nyf", ".ora", ".pdm", ".prc", ".tdb"
    ],
    "Datasets": [".csv", ".xlsx", ".json", ".sql"
    ],
    "Fonts": [
        ".ttf", ".otf", ".woff", ".woff2", ".eot", ".sfnt", ".font", ".pfb", ".pfm", ".afm",
        ".bin", ".cff", ".dfont", ".gst", ".pfa", ".sfd", ".std", ".svg", ".ttc", ".vfb",
        ".vfont", ".xfont", ".fon", ".fnt", ".otb", ".tfm"
    ]
}

Let me know what you think.

mnestorov commented 5 months ago

Hey fabiendostie,

Thank you for your contribution. Everything looks great and I will be happy to implement your improvements. If you have any other suggestions, feel free to share and/or add them as well. So if you want you can create a pull request.

Regards, Martin

fabiendostie commented 5 months ago

Actually, it would be nice to have an option to choose whether or not the script sorts files into a dated subfolder, or even to ask whether we want them in a weekly folder.

Also, there should be a mechanism to deal with new, unknown file extensions, where the script would ask the user whether it should create a new category or add the extension to an existing category in the list. And there should be a mechanism to deal with folders: should it regroup folders into a single folder named "Folders"?

I'll look into this and get back.

fabiendostie commented 5 months ago

I just made some changes to my branch of the code: main.py

import os
import shutil
import time
import re
import logging
from datetime import datetime, timedelta
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from config import download_folder, classification_rules, GREEN, RESET, log_filename, DAYS_BEFORE_ARCHIVE, ARCHIVE_ACTION, ARCHIVE_FOLDER
from tqdm import tqdm

# Configure logging: records at INFO level and above go to the file named by
# log_filename (imported from config), each line formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(filename=log_filename, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def ask_sorting_preference():
    """Ask on stdin whether files should be sorted into per-date subfolders.

    Returns:
        bool: True only when the reply, with surrounding whitespace removed
        and lower-cased, is exactly "yes"; any other reply yields False.
    """
    print("Do you want to sort files into subfolders based on their dates? (yes/no)")
    reply = input()
    normalized_reply = reply.strip().lower()
    if normalized_reply == "yes":
        return True
    return False

def ask_for_new_extension_category(unknown_extension):
    """Prompt for the category an unrecognised extension should be sorted into.

    Args:
        unknown_extension: Extension text interpolated into the prompt.

    Returns:
        str: The reply read from stdin with surrounding whitespace removed.
    """
    prompt = f"New file extension detected: {unknown_extension}. Into which category should it be sorted?"
    print(prompt)
    return input().strip()

def update_classification_rules(extension, category, config_path="config.py"):
    """Register an extension under a category and persist the rules to disk.

    The in-memory ``classification_rules`` mapping is updated first. Then the
    ``classification_rules = {...}`` assignment inside the config file is
    regenerated in place, leaving every other line of that file untouched.
    (Rewriting the whole file with only the rules would drop the other
    settings that main.py imports from config, such as ``download_folder``
    and ``log_filename``.)

    Args:
        extension: Extension as produced by ``os.path.splitext`` (leading dot
            included), e.g. ".xyz".
        category: Name of an existing category, or of a new one to create.
        config_path: Python config file to update. Defaults to "config.py",
            resolved against the current working directory.

    Raises:
        SyntaxError: If the existing config file is not valid Python; the
            file is left unmodified in that case.
        OSError: If the config file cannot be read or written.
    """
    # Local imports keep this function self-contained.
    import ast
    import json

    # Update the in-memory rules without recording the same extension twice.
    category_extensions = classification_rules.setdefault(category, [])
    if extension not in category_extensions:
        category_extensions.append(extension)

    def quote(text):
        # json.dumps yields a double-quoted literal with quotes, backslashes
        # and control characters escaped, which is also a valid Python literal.
        return json.dumps(text, ensure_ascii=False)

    rendered = ['classification_rules = {']
    for cat, extensions in classification_rules.items():
        extensions_str = ', '.join(quote(ext) for ext in extensions)
        rendered.append(f'    {quote(cat)}: [{extensions_str}],')
    rendered.append('}')

    try:
        with open(config_path, encoding="utf-8") as config_file:
            source = config_file.read()
    except FileNotFoundError:
        source = ""

    # Locate the existing top-level assignment (the last one wins, as it does
    # at import time). end_lineno requires Python 3.8+.
    span = None
    for node in ast.parse(source).body:
        if isinstance(node, ast.Assign) and any(
            isinstance(target, ast.Name) and target.id == "classification_rules"
            for target in node.targets
        ):
            span = (node.lineno - 1, node.end_lineno)

    # Text-mode reads normalise line endings to "\n", so splitting on "\n"
    # matches the line numbers reported by ast.
    lines = source.split("\n")
    if span is None:
        # No assignment yet: append one after the existing content.
        if lines and lines[-1] == "":
            lines.pop()
        lines.extend(rendered)
    else:
        # Whole lines are replaced; comments inside the old literal are lost.
        lines[span[0]:span[1]] = rendered

    new_source = "\n".join(lines)
    if not new_source.endswith("\n"):
        new_source += "\n"

    with open(config_path, "w", encoding="utf-8") as config_file:
        config_file.write(new_source)

def _unique_destination(directory, name):
    """Return a path for ``name`` inside ``directory`` that does not exist yet.

    If ``name`` is already taken, " (1)", " (2)", ... is inserted before the
    extension until a free name is found.
    """
    candidate = os.path.join(directory, name)
    if not os.path.lexists(candidate):
        return candidate
    stem, suffix = os.path.splitext(name)
    counter = 1
    while True:
        candidate = os.path.join(directory, f"{stem} ({counter}){suffix}")
        if not os.path.lexists(candidate):
            return candidate
        counter += 1


def classify_and_move_existing_files_and_folders(sorting_preference_date_based):
    """Sort everything currently at the top level of ``download_folder``.

    Files are moved into ``<download_folder>/<category>`` according to
    ``classification_rules`` (the first category listing the lower-cased
    extension wins; unmatched files go to "Uncategorized"). Directories that
    the script does not own are moved into ``<download_folder>/Folders``.

    Args:
        sorting_preference_date_based: When truthy, files are placed in a
            ``YYYY-MM-DD`` subfolder (today's date) of their category folder.

    Notes:
        - "Uncategorized" is treated as a script-owned directory, so a later
          run does not sweep it into "Folders".
        - An item is never moved onto an existing name: a numeric suffix is
          added instead, so earlier files are not overwritten.
    """
    uncategorized = "Uncategorized"
    folders_path = os.path.join(download_folder, "Folders")

    # Top-level directories owned by the script; these stay where they are.
    script_created_dirs = {
        os.path.join(download_folder, category) for category in classification_rules
    }
    script_created_dirs.add(os.path.join(download_folder, uncategorized))
    script_created_dirs.add(folders_path)

    today = datetime.today().strftime('%Y-%m-%d') if sorting_preference_date_based else None

    # Extension -> category lookup, built once. setdefault keeps the first
    # category that lists an extension, matching a first-match scan.
    category_by_extension = {}
    for category, extensions in classification_rules.items():
        for extension in extensions:
            category_by_extension.setdefault(extension, category)

    os.makedirs(folders_path, exist_ok=True)

    # os.listdir returns a snapshot, so directories created below are not
    # revisited during this pass.
    for item in os.listdir(download_folder):
        item_path = os.path.join(download_folder, item)
        if os.path.isfile(item_path):
            file_extension = os.path.splitext(item)[1].lower()
            destination_category = category_by_extension.get(file_extension, uncategorized)
            destination_path = os.path.join(download_folder, destination_category)
            if today is not None:
                destination_path = os.path.join(destination_path, today)
            os.makedirs(destination_path, exist_ok=True)
            shutil.move(item_path, _unique_destination(destination_path, item))
            print(f"Moved file {item} to {destination_path}")
        elif os.path.isdir(item_path) and item_path not in script_created_dirs:
            shutil.move(item_path, _unique_destination(folders_path, item))
            print(f"Moved folder {item} to {folders_path}")

class MyHandler(FileSystemEventHandler):
    """Event handler that reacts to modification events for non-directories."""

    def on_modified(self, event):
        """Hand the event's source path to process() unless it is a directory."""
        if event.is_directory:
            return
        self.process(event.src_path)

    def process(self, file_path):
        """Make sure the file's extension is known to classification_rules.

        When the lower-cased extension is not listed under any category, the
        user is asked for a category and the rules are updated. The file
        itself is not moved by this method.
        """
        file_name = os.path.basename(file_path)
        file_extension = os.path.splitext(file_name)[1].lower()

        # Gather every extension listed under any category.
        known_extensions = set()
        for extensions in classification_rules.values():
            known_extensions.update(extensions)

        if file_extension not in known_extensions:
            chosen_category = ask_for_new_extension_category(file_extension)
            update_classification_rules(file_extension, chosen_category)
        print(f"File {file_name} processed.")

def main():
    """Sort the existing contents once, then watch the folder until Ctrl+C.

    Asks for the date-based sorting preference, organises what is already in
    download_folder, and then runs an Observer on that folder (recursively).
    A KeyboardInterrupt stops the observer, which is then joined.
    """
    date_based = ask_sorting_preference()
    print("Organizing existing files and folders...")
    classify_and_move_existing_files_and_folders(date_based)

    print("Starting file monitoring...")
    watcher = Observer()
    watcher.schedule(MyHandler(), path=download_folder, recursive=True)
    watcher.start()
    print("Monitoring started. Press Ctrl+C to stop.")

    # Idle in the main thread; the observer does its work in the background.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        watcher.stop()
    watcher.join()
    print("Monitoring stopped.")


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

config.py

# Set the path to your downloads folder.
# NOTE(review): this is an absolute path inside one specific user's home
# directory; change it to match your own machine before running.
download_folder = '/Users/lefab/Downloads'

# Define the classification rules: category (destination folder) name ->
# list of lower-case extensions, each with its leading dot.
#
# How main.py uses this mapping when sorting existing files:
# - The extension looked up is os.path.splitext(name)[1].lower(), i.e. only
#   the final suffix. A name ending in ".tar.gz" is therefore looked up as
#   ".gz", and the ".tar.gz" entry below is never matched by that lookup.
# - Categories are scanned in the order written here and the scan stops at
#   the first list containing the extension, so an extension listed under
#   several categories always lands in the earliest one.
#
# NOTE(review): extensions currently listed under more than one category:
# ".raw", ".eps", ".ai", ".awb", ".nsf", ".jar", ".arj", ".asc", ".php",
# ".xlsx", ".bin", ".svg". ".ora" is listed twice under "Databases".
classification_rules = {
    "Audio": [
        ".mp3", ".wav", ".flac", ".aac", ".ogg", ".wma", ".aiff", ".au", ".m4a", ".mp2",
        ".mpc", ".opus", ".ra", ".raw", ".vox", ".wv", ".webm", ".8svx", ".cda", ".mid",
        ".midi", ".mod", ".mpa", ".oga", ".s3m", ".spx", ".3ga", ".aax", ".ac3", ".amr",
        ".ape", ".asf", ".ast", ".awb", ".dts", ".flp", ".gsm", ".iklax", ".ivs", ".m3u",
        ".m4b", ".m4r", ".mmf", ".msv", ".nmf", ".nsf", ".nwc", ".pcm", ".qcp", ".tta"
    ],
    "Video": [
        ".mp4", ".avi", ".flv", ".mov", ".wmv", ".mkv", ".m4v", ".mpg", ".mpeg", ".vob",
        ".rmvb", ".ogv", ".3gp", ".3g2", ".drc", ".gifv", ".m2v", ".ogx", ".svi", ".yuv",
        ".264", ".3gpp", ".asx", ".bik", ".braw", ".divx", ".dv", ".evo", ".f4v", ".flc",
        ".h264", ".hevc", ".m2ts", ".m8", ".mnv", ".mts", ".nsv", ".nuv", ".pva", ".r3d",
        ".rl2", ".roq"
    ],
    "Images": [
        ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".psd", ".eps", ".ai", ".indd",
        ".raw", ".cr2", ".nef", ".orf", ".sr2", ".dwg", ".dxf", ".heif", ".ico",
        ".jng", ".jxl", ".pbm", ".pcx", ".pict", ".apng", ".avif", ".bpg", ".cgm",
        ".cmx", ".dib", ".djv", ".flif", ".hdr", ".hrz", ".ilbm", ".lbm", ".miff",
        ".niff", ".nol", ".pam", ".pcd", ".pgm", ".ppm", ".ras", ".sgi", ".tga", ".wbmp",
        ".xpm"
    ],
    "Vectorial Images": [
        ".svg", ".cdr", ".eps", ".ai", ".afdesign", ".avit", ".e2d", ".fig", ".sk", ".sk1", ".sxd", ".v2d",
        ".vml", ".wmf", ".xar", ".xcf", ".vsd", ".ppt", ".odg", ".svgz", ".drw", ".emf",
        ".gt2", ".hpgl", ".iges", ".mgcb", ".plt", ".rdp", ".sda", ".sdr", ".stl", ".svf",
        ".swf", ".tikz", ".wmz", ".xaml", ".xd", ".xmind", ".3dv", ".amf", ".art", ".asc",
        ".bvh"
    ],
    "Installers": [
        ".exe", ".msi", ".dmg", ".pkg", ".deb", ".rpm", ".appimage", ".run", ".bat", ".cmd",
        ".bin", ".app", ".gadget", ".jar", ".wsf", ".aam", ".air", ".appx", ".awb", ".crx",
        ".ipk", ".isu", ".job", ".jse", ".tar.gz", ".tgz", ".bz2", ".lz", ".lzma", ".lzo",
        ".xz", ".z", ".7zip", ".ace", ".afa", ".alz", ".arc", ".arj", ".bz", ".cabinet",
        ".cpio", ".dar", ".dd", ".ear", ".gca", ".ha"
    ],
    "Documents": [
        ".doc", ".docx", ".pdf", ".txt", ".odt", ".rtf", ".xls", ".xlsx", ".pptx",
        ".odp", ".ods", ".md", ".epub", ".djvu", ".mobi",
        ".azw", ".azw3", ".fb2", ".ibooks", ".cbr", ".cbz", ".abw", ".ans", ".asc", ".aww",
        ".ccf", ".chm", ".clkw", ".docm", ".dot", ".dotx", ".egnt", ".fdx", ".ftm", ".ftx",
        ".gdoc", ".hwp", ".hwpml", ".log", ".lwp", ".mbp", ".me", ".nbp", ".neis", ".nq"
    ],
    "Archives": [
        ".zip", ".rar", ".7z", ".tar", ".gz", ".apk", ".arj",
        ".cab", ".iso", ".jar", ".part", ".pea", ".s7z", ".sit",
        ".sitx", ".zipx", ".zoo", ".war", ".cdx", ".cso",
        ".dgc", ".hki", ".ice", ".j", ".lha", ".lzh",
        ".lzx", ".pak", ".rar5", ".rk", ".sen", ".sfx", ".shar", ".sqx", ".uue", ".warc"
    ],
    "Programming": [
        ".py", ".js", ".java", ".c", ".cpp", ".cs", ".php", ".rb", ".swift", ".go", ".ts",
        ".pl", ".lua", ".groovy", ".scala", ".rs", ".kt", ".m", ".dart", ".pas", ".asm",
        ".vbs", ".s", ".h", ".hpp", ".ada", ".adb", ".ads", ".agda", ".asmx", ".awk",
        ".bash", ".bsh", ".cls", ".cob", ".coffee", ".cppm", ".csx", ".cu", ".cuh",
        ".d", ".erl", ".f", ".f90", ".f95", ".fs", ".gml", ".hcl", ".json", ".hs"
    ],
    "Web": [
        ".html", ".css", ".php", ".asp", ".jsp", ".aspx", ".cgi", ".xml", ".ajax",
        ".cfm", ".html5", ".xhtml", ".rss", ".atom", ".scss", ".less", ".sass", ".wasm", ".vue",
        ".svelte", ".erb", ".haml", ".handlebars", ".hbs", ".jspf", ".liquid", ".mustache",
        ".phtml", ".rhtml", ".slim", ".tmpl", ".twig", ".volt", ".xht", ".xsl", ".yaml", ".yml",
        ".do", ".jhtm", ".jspx", ".jst", ".lda", ".rjs", ".tld"
    ],
    "Databases": [
        ".db", ".mdb", ".accdb", ".sqlite", ".dbf", ".mdf", ".ora", ".fdb", ".db2",
        ".ib", ".myd", ".myi", ".frm", ".odb", ".pdb", ".sqlitedb", ".sqlite3", ".dat", ".db3",
        ".sdb", ".s3db", ".dbk", ".dbx", ".dcb", ".fmp", ".fp5", ".fp7", ".gdb", ".kdb", ".accde",
        ".adp", ".daf", ".edb", ".fmp12", ".frx", ".itdb", ".mdbhtml", ".ndf", ".nsf", ".nv2",
        ".nyf", ".ora", ".pdm", ".prc", ".tdb"
    ],
    "Datasets": [".csv", ".xlsx", ".sql"
    ],
    "Downloader files": [".torrent", ".nzb"
    ],
    "Fonts": [
        ".ttf", ".otf", ".woff", ".woff2", ".eot", ".sfnt", ".font", ".pfb", ".pfm", ".afm",
        ".bin", ".cff", ".dfont", ".gst", ".pfa", ".sfd", ".std", ".svg", ".ttc", ".vfb",
        ".vfont", ".xfont", ".fon", ".fnt", ".otb", ".tfm"
    ]
}

# Automatic folder cleanup
# NOTE(review): main.py imports these three names, but the main.py code shown
# alongside this file never reads them - confirm where the cleanup is implemented.
DAYS_BEFORE_ARCHIVE = 30  # You can change this value based on your requirement
ARCHIVE_ACTION = "move"  # "move" or "delete"
ARCHIVE_FOLDER = "Archived"

# ANSI escape codes: GREEN switches terminal text to green, RESET clears
# all attributes again.
GREEN = '\033[32m'
RESET = '\033[0m'

# Log filename, passed to logging.basicConfig(filename=...) in main.py.
# A relative name like this one is resolved against the working directory.
log_filename = 'app.log'