Open zgypa opened 1 month ago
This data is no longer on Box; it has been lost since the upload. I created a script to extract the creation date, modification date, and all other dates from the files on the drives:
@echo off
REM Recursively scans a directory for TIF and PDF files and appends their
REM metadata to a CSV file, running one WMIC query per file.
REM Usage: this_script.bat DIRECTORY OUTPUT_CSV
REM Delayed expansion is required because the variables below are set and
REM read inside the same parenthesized loop body.
setlocal enabledelayedexpansion
REM Parameters for directory and output CSV file
set "directory=%~1"
set "outputFile=%~2"
REM Create the CSV file with headers
REM This first write replaces any existing content of the output file.
REM NOTE review: the header lists FileModification before LastAccessTime,
REM while the WMIC query below asks for LastAccessed before LastModified.
REM Confirm that the column order WMIC emits really matches this header.
REM NOTE review: HostName presumably corresponds to the node column that the
REM WMIC csv format adds - verify against real output.
> "%outputFile%" echo "DirectoryName","FileName","FilePath","FileSize","HostName","FileCreation","FileModification","LastAccessTime"
REM Loop through all .TIF* and .PDF files in the directory (case-insensitive)
for /r "%directory%" %%f in (*.tif* *.pdf) do (
REM Capture the full path, the name with extension, the drive plus folder,
REM and the file size of the current match.
set "filePath=%%f"
set "fileName=%%~nxf"
set "directoryName=%%~dpf"
set "fileSize=%%~zf"
REM Properly escape the file path for the WMIC command
REM Every backslash is doubled before the path is placed in the where clause.
set "escapedFilePath=!filePath:\=\\!"
REM Append the file metadata using WMIC directly into the CSV, skipping the header
REM The carets keep the commas of the property list intact during parsing.
REM A new WMIC process is started for every single file.
for /f "skip=2 tokens=*" %%a in ('wmic datafile where "name='!escapedFilePath!'" get CreationDate^,LastAccessed^,LastModified /format:csv') do (
REM set /p fed from nul prints the text without adding a line break.
REM NOTE review: the row values are written unquoted, so a comma inside a
REM path would shift the columns. Row separation relies on whatever line
REM ending is left in the WMIC output - confirm rows land on separate lines.
<nul set /p ="!directoryName!,!fileName!,!filePath!,!fileSize!,%%a" >> "%outputFile%"
)
)
echo Done!
endlocal
I had to use a .BAT file because there is no permission to execute .PS1 files on the Labs computers. I will then upload the CSV files to Box.
The .BAT file was very slow (it was taking hours) and was gobbling up RAM like there's no tomorrow. After it passed 1 GB of RAM per process, I decided to abort and write a Python script instead, which worked a LOT better. I put everything in Box, but here is the script:
import hashlib
import os
import sys
import argparse
from pathlib import Path
import time
from tqdm import tqdm
import fnmatch
# Parsed command-line options; assigned by main() and read by file_generator().
args = None
def get_file_timestamps(file_path):
    """Return the creation, modification and access times of a file.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A ``(creation, modification, access)`` tuple of strings formatted as
        ``YYYY-MM-DD HH:MM:SS`` in local time.

    Raises:
        OSError: If the file cannot be stat'ed.
    """
    stat = Path(file_path).stat()

    # st_birthtime is not provided on every platform (it is missing on Linux
    # and on Windows before Python 3.12). Fall back to st_ctime there: it is
    # the creation time on Windows, but the last metadata change on Unix.
    created = getattr(stat, "st_birthtime", None)
    if created is None:
        created = stat.st_ctime

    def _format(timestamp):
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))

    return _format(created), _format(stat.st_mtime), _format(stat.st_atime)
def file_generator(directory, include_all=None):
    """Yield the path of each selected file below ``directory``, one at a time.

    Args:
        directory: Root directory to walk recursively.
        include_all: When true, yield every file. When false, yield only
            files whose lowercased name matches ``*.tif*`` or ``*.pdf*``.
            When ``None`` (the default), the module-level ``args.all`` flag
            is used, and treated as false if ``args`` has not been parsed.

    Yields:
        The joined path (walk root + file name) of each selected file.
    """
    if include_all is None:
        # getattr keeps the generator usable when the module-level ``args``
        # is still None, e.g. when called without going through main().
        include_all = bool(getattr(args, "all", False))

    patterns = ('*.tif*', '*.pdf*')
    for root, _dirs, files in os.walk(directory):
        for name in files:
            if not include_all:
                lowered = name.lower()
                if not any(fnmatch.fnmatch(lowered, pattern) for pattern in patterns):
                    continue
            yield os.path.join(root, name)
def process_directory(directory, output_file=None, add_hash=False):
    """Write one CSV row of metadata for every file found under ``directory``.

    Files come from ``file_generator`` so large trees are streamed rather
    than collected in memory. A file that cannot be processed is reported on
    stderr and skipped; the scan continues with the next file.

    Args:
        directory: Root directory to scan.
        output_file: Path of the CSV file to create. When falsy, rows are
            written to standard output.
        add_hash: When true, fill the ``hash`` column with the file's MD5
            digest; otherwise the column is left empty.
    """
    # Local import so this function does not depend on the file's import block.
    import csv

    output_stream = open(output_file, 'w') if output_file else sys.stdout
    try:
        output_stream.write("dirname,basename,size,hash,creation,modification,access\n")
        # QUOTE_ALL with "\n" keeps the row layout of the previous hand-built
        # lines, while correctly escaping quotes embedded in file names.
        writer = csv.writer(output_stream, quoting=csv.QUOTE_ALL, lineterminator="\n")
        progress = tqdm(file_generator(directory), desc="Processing Files")
        for file_path in progress:
            try:
                creation, modification, access = get_file_timestamps(file_path)
                digest = calculate_file_hash(file_path) if add_hash else ""
                writer.writerow([
                    os.path.dirname(file_path),
                    os.path.basename(file_path),
                    os.path.getsize(file_path),
                    digest,
                    creation,
                    modification,
                    access,
                ])
            except Exception as e:
                # Deliberately best-effort: one unreadable file must not abort
                # the scan. Report on stderr so that CSV written to stdout
                # stays clean.
                print(f"Error processing {file_path}: {str(e)}", file=sys.stderr)
    finally:
        # Never close sys.stdout; only close a file this function opened.
        if output_file:
            output_stream.close()
def calculate_file_hash(file_path):
    """Return the hexadecimal MD5 digest of the file's contents.

    The file is read in 4096-byte pieces so it never has to fit in memory.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
def main():
    """Parse the command line and scan the requested directory.

    The parsed options are stored in the module-level ``args`` so that other
    functions in this file can read them. If the given path is not an
    existing directory, an error is printed to stderr and nothing is scanned.
    """
    global args

    parser = argparse.ArgumentParser(description="Process a directory and list file timestamps.")
    parser.add_argument("directory", type=str, help="The directory to process")
    parser.add_argument("-o", "--output", type=str, help="Output file to write the results")
    parser.add_argument("-5", "--md5", action="store_true", help="Add md5 hash as well")
    parser.add_argument("-a", "--all", action="store_true", help="Include all files, not only TIFFs and PDFs")
    args = parser.parse_args()

    # isdir rather than exists: a path to a regular file would otherwise pass
    # the check and silently produce an empty listing.
    if not os.path.isdir(args.directory):
        print(
            f"Error: The directory {args.directory} does not exist or is not a directory.",
            file=sys.stderr,
        )
        return

    process_directory(args.directory, args.output, args.md5)
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
DateOfSecondaryCapture and TimeOfSecondaryCapture are a place to store when the images were first scanned.