seb-m / pyinotify

Monitoring filesystems events with inotify on Linux.
http://github.com/seb-m/pyinotify/wiki
MIT License
2.29k stars 379 forks source link

UnicodeEncodeError: 'utf-8' codec can't encode character '\udc84' in position 81: surrogates not allowed #201

Open Evil2000 opened 2 years ago

Evil2000 commented 2 years ago

I have a very large filesystem that I want to monitor for changes, triggering a command after a timeout. I wrote this script to do that:

import logging
import re
import subprocess
import pyinotify
from time import time as now, sleep

# Root of the data volume; removed from event paths before they are scanned.
data_dir = "/mnt/data"
# Sub-tree (relative to data_dir) that is actually watched.
files_dir = "/files/"
# Seconds that must pass after the latest event on a path before it is due.
timeout = 30
# Scan command prefix; the path relative to data_dir is appended to it.
scan_cmd = "/usr/local/bin/files_scan.sh --path="
# Maps each changed pathname to the timestamp at which it becomes due.
triggered_files = {}

# Logging
def logger_init():
    """Return the "ncnotify" logger configured for console debug output.

    A stream handler with a timestamped format is attached and the level is
    set to DEBUG.  The handler is only attached when the logger has none yet,
    so calling this function more than once does not duplicate log lines.
    """
    logg = logging.getLogger("ncnotify")
    # getLogger() returns the same object for the same name; guard against
    # stacking a second handler on it.
    if not logg.handlers:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(
            logging.Formatter("[%(asctime)s %(name)s %(levelname)s] %(message)s"))
        logg.addHandler(console_handler)
    logg.setLevel(logging.DEBUG)
    return logg

# Shared logger used by the event handler and the main loop below.
log = logger_init()

class InotifyEventHandler(pyinotify.ProcessEvent):
    """Event handler that (re)schedules a scan for every reported path."""

    def process_default(self, event):
        """Log the event and set the path's due time to now + timeout."""
        description = str(event)
        log.debug(description)
        # A newer event on the same path overwrites the previous due time.
        due_at = now() + timeout
        triggered_files[event.pathname] = due_at

# Watch manager plus a background notifier thread that hands events to the
# handler class.
inotify = pyinotify.WatchManager()
notifier = pyinotify.ThreadedNotifier(inotify, InotifyEventHandler())
notifier.start()
mask = (
    pyinotify.IN_CLOSE_WRITE
    | pyinotify.IN_CREATE
    | pyinotify.IN_DELETE
    | pyinotify.IN_DELETE_SELF
    | pyinotify.IN_MODIFY
    | pyinotify.IN_MOVE_SELF
    | pyinotify.IN_MOVED_FROM
    | pyinotify.IN_MOVED_TO
)

# Raw string: '\.' is a regex escape, not a valid Python string escape.
excl = pyinotify.ExcludeFilter([r'^\.'])
try:
    inotify.add_watch(data_dir + files_dir, mask, rec=True, auto_add=True, exclude_filter=excl)
except pyinotify.WatchManagerError as err:
    # NOTE(review): add_watch appears to default to quiet=True, in which case
    # errors are logged by pyinotify instead of raised -- confirm.
    log.error(err)
except Exception:
    # Any other failure (e.g. UnicodeEncodeError on an undecodable file name):
    # stop the notifier before re-raising so a failed setup does not leave the
    # background thread running.
    notifier.stop()
    raise

# Concatenate first, then format; '%' binds tighter than '+'.
log.debug("Watching changes in: %s", data_dir + files_dir)
# Main loop: once a second, report every path whose due time has passed.
# The notifier is stopped when the loop is left (e.g. on Ctrl-C).
try:
    while True:
        # Iterate over a snapshot of the keys: the notifier thread inserts
        # into the dict while this loop runs.
        for filepath in list(triggered_files):
            if now() > triggered_files[filepath]:
                triggered_files.pop(filepath)
                # Remove data_dir as a literal leading prefix.  Using it as a
                # regex pattern would misbehave on regex metacharacters and
                # could match in the middle of the path.
                if filepath.startswith(data_dir):
                    path = filepath[len(data_dir):]
                else:
                    path = filepath
                log.debug("%s", scan_cmd + path)
                # Left disabled as in the original.  NOTE: splitting on spaces
                # breaks paths that contain spaces; pass an argument list.
                #subprocess.run((scan_cmd + path).split(' '))
        # Avoid spinning at 100% CPU while nothing is due.
        sleep(1)
finally:
    notifier.stop()

The filesystem is ext4, so the encoding is UTF-8. But sometimes there are special characters in the filenames. Now Python complains:

user@NAS:~$ python3 inotifyscan.py 
Traceback (most recent call last):
  File "/home/user/inotifyscan.py", line 105, in <module>
    inotify.add_watch(data_dir + files_dir, mask, rec=True, auto_add=True, exclude_filter=excl)
  File "/usr/lib/python3/dist-packages/pyinotify.py", line 1916, in add_watch
    exclude_filter)
  File "/usr/lib/python3/dist-packages/pyinotify.py", line 1833, in __add_watch
    wd = self._inotify_wrapper.inotify_add_watch(self._fd, path, mask)
  File "/usr/lib/python3/dist-packages/pyinotify.py", line 153, in inotify_add_watch
    return self._inotify_add_watch(fd, pathname, mask)
  File "/usr/lib/python3/dist-packages/pyinotify.py", line 246, in _inotify_add_watch
    pathname = pathname.encode(sys.getfilesystemencoding())
UnicodeEncodeError: 'utf-8' codec can't encode character '\udc84' in position 81: surrogates not allowed

Watching the filesystem with the command-line tool inotifywatch works without any problem. It seems that pyinotify.py expects strictly UTF-8 filenames. How can I get around this?

Regards. :-)

Evil2000 commented 2 years ago

It seems that there is no easy solution to this. So I decided to change my script to use the output of inotifywait from the commandline. If someone is interested, here's what I got:

import logging
import re
from subprocess import run, Popen, PIPE
import threading
from time import time as now, sleep

# Root of the data volume; removed from reported paths before scanning.
data_dir = '/mnt/data'
# Sub-tree (relative to data_dir) that is actually watched.
files_dir = '/files/'
# Seconds that must pass after the latest event on a path before it is due.
timeout = 300

# Argument vector for inotifywait.  It is passed to Popen without a shell, so
# the options must not carry shell-style quotes: they would reach inotifywait
# as literal characters and end up in the exclude pattern and in the output.
# NOTE(review): '^\.' is presumably meant to skip dot-files; verify what
# string inotifywait matches --exclude against.
inotify_cmd = [
    "inotifywait",
    "--event=create,close_write,move,delete",
    "--quiet",
    r"--exclude=^\.",
    "--recursive",
    "--format=%w",
    "--timefmt=%s",
    "--monitor",
    data_dir + files_dir,
]
# Scan command; the last element is replaced with "--path=<dir>" per run.
scan_cmd = ['/usr/local/bin/files_scan.sh', '--path=']

# Maps each changed directory to the timestamp at which it becomes due.
triggered_files = {}

# Logging
def logger_init(loglevel=logging.NOTSET):
    """Return the "ncnotify" logger configured for console output.

    Args:
        loglevel: Level to set on the logger (defaults to logging.NOTSET).

    Returns:
        The logger named "ncnotify".  A stream handler with a timestamped
        format is attached only when the logger has no handler yet, so
        calling this function more than once does not duplicate log lines.
    """
    logg = logging.getLogger("ncnotify")
    # getLogger() returns the same object for the same name; guard against
    # stacking a second handler on it.
    if not logg.handlers:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(
            logging.Formatter("[%(asctime)s %(name)s %(levelname)s] %(message)s"))
        logg.addHandler(console_handler)
    logg.setLevel(loglevel)
    return logg

# Shared DEBUG-level logger used by the reader thread and the main loop below.
log = logger_init(logging.DEBUG)

def process_inotify():
    """Read paths printed by inotifywait and schedule each one for a scan.

    Starts `inotify_cmd` as a child process and, for every line it prints,
    stores now() + timeout in `triggered_files` under that path; a later
    event for the same path overwrites the earlier due time.  Returns when
    the child closes its stdout; the child is then waited for.
    """
    # errors='surrogateescape': file names are arbitrary bytes and need not be
    # valid UTF-8.  Strict decoding would raise UnicodeDecodeError on such a
    # name and end this thread; surrogateescape keeps the bytes recoverable
    # when the path is later passed on as a subprocess argument.
    with Popen(inotify_cmd, stdout=PIPE, encoding='utf-8',
               errors='surrogateescape') as proc:
        # Concatenate first, then format; '%' binds tighter than '+'.
        log.debug("Watching changes in: %s", data_dir + files_dir)
        for pathname in proc.stdout:
            # Drop the line ending and any quote characters that surround the
            # path in the output.
            pathname = pathname.strip("'\r\n")
            triggered_files[pathname] = now() + timeout
            log.debug("[+] %s: %s", pathname, triggered_files[pathname])

# Reader thread: fills triggered_files from the inotifywait output.
t = threading.Thread(target=process_inotify)
t.start()

# Main loop: once a second, run the scan command for every path whose due
# time has passed.
while True:
    # Iterate over a snapshot of the keys: the reader thread inserts into the
    # dict while this loop runs.
    for filepath in list(triggered_files):
        if now() > triggered_files[filepath]:
            triggered_files.pop(filepath)
            # Remove data_dir as a literal leading prefix.  Using it as a
            # regex pattern would misbehave on regex metacharacters and could
            # match in the middle of the path.
            if filepath.startswith(data_dir):
                path = filepath[len(data_dir):]
            else:
                path = filepath
            # Build a fresh argument list instead of mutating scan_cmd: keep
            # everything but its last element and append the real --path.
            cmd = scan_cmd[:-1] + ["--path=" + path]
            log.debug("%s", cmd)
            run(cmd)
    sleep(1)

I use this script to trigger a rescan of the corresponding folder by Nextcloud when a file was changed externally. Maybe it's useful to somebody else :-)