NHFLO / data

Data repository of NHFLO. Contains data descriptions and mockup data for testing. Does not include the actual data.
GNU Affero General Public License v3.0
1 stars 2 forks source link

Disincentivize shapefile and xls formats in favor of GeoJSON and CSV #14

Closed bdestombe closed 4 days ago

bdestombe commented 1 week ago

Adjust `get_all_folders` to retrieve only the folders of the most recent data versions.

import os
import pytest
from pathlib import Path
from typing import List, Set

def get_all_folders(root_dir: str = ".") -> List[Path]:
    """
    Get all folders below the given directory, recursively, in sorted order.

    Parameters
    ----------
    root_dir : str, optional
        Root directory to start searching from, by default current directory (".").
        The root directory itself is not part of the result.

    Returns
    -------
    List[Path]
        Sorted list of Path objects, one per directory found below ``root_dir``.
        The list is empty when ``root_dir`` has no subdirectories or does not
        exist.

    Examples
    --------
    >>> folders = get_all_folders("./data")
    >>> print(folders[0])
    data/subfolder1

    Notes
    -----
    ``os.walk`` yields directories in the order the file system lists them,
    which differs between platforms. The result is sorted so that callers
    (and failure messages built from it) see a stable order.
    """
    folders: List[Path] = []
    for root, dirs, _ in os.walk(root_dir):
        # Path(root, name) joins the components just like os.path.join would.
        folders.extend(Path(root, dir_name) for dir_name in dirs)
    folders.sort()
    return folders

def find_files_by_extension(folder: Path, extensions: Set[str]) -> List[Path]:
    """
    Collect every file below a folder whose extension is in a given set.

    Parameters
    ----------
    folder : Path
        Folder to search; the search descends into all of its subfolders.
    extensions : Set[str]
        Extensions to look for. Each entry may be written with or without
        the leading dot, in any letter case.

    Returns
    -------
    List[Path]
        Paths of the matching files, in the order the recursive glob yields
        them. Directories are never returned, even if their name ends in one
        of the extensions.

    Examples
    --------
    >>> files = find_files_by_extension(Path("./data"), {".csv", "txt"})
    >>> # Matches e.g. example.csv, EXAMPLE.CSV and data.TXT

    Notes
    -----
    Only the final suffix of a file name is compared (``Path.suffix``), and
    the comparison is done on lowercased text, so ``.shp`` and ``.SHP`` are
    treated alike.
    """
    # Bring every requested extension to the canonical ".ext" lowercase form.
    wanted = set()
    for raw_ext in extensions:
        dotted = raw_ext if raw_ext.startswith('.') else f'.{raw_ext}'
        wanted.add(dotted.lower())

    # Walk the whole tree once, keeping regular files with a wanted suffix.
    return [
        candidate
        for candidate in folder.glob('**/*')
        if candidate.is_file() and candidate.suffix.lower() in wanted
    ]

def test_no_shapefile_or_geopackage():
    """
    Test for absence of shapefile and geopackage formats.

    This test ensures that no shapefiles (.shp, .shx, .dbf, .prj) or
    geopackage (.gpkg) files are present in any subfolder of the project.
    The check is case-insensitive, so it will catch files like .SHP or .GPKG.

    Raises
    ------
    pytest.fail
        If any prohibited geographic files are found, with details about the files
        and recommendation to use GeoJSON format.

    Notes
    -----
    GeoJSON is preferred over shapefiles and geopackages because:
    - It's text-based and readable
    - Works better with version control
    - Single file format instead of multiple files (like shapefiles)

    Every folder returned by ``get_all_folders`` is searched recursively, so a
    nested file is found once per ancestor folder. Matches are therefore
    collected in a set and reported once each, in sorted order. Files located
    directly in the working directory (not inside any subfolder) are not
    inspected.
    """
    forbidden_geo_extensions = {'.shp', '.shx', '.dbf', '.prj', '.gpkg'}

    # A set removes the duplicates produced by the overlapping recursive scans.
    unique_files = set()
    for folder in get_all_folders():
        unique_files.update(find_files_by_extension(folder, forbidden_geo_extensions))
    forbidden_files = sorted(unique_files)

    if forbidden_files:
        files_str = '\n'.join(str(f) for f in forbidden_files)
        pytest.fail(
            f"Found prohibited geographic files:\n{files_str}\n\n"
            "Recommendation: Convert these files to GeoJSON format for better "
            "Git compatibility and readability."
        )

def test_no_excel_files():
    """
    Test for absence of Excel file formats.

    This test ensures that no Excel files (.xls, .xlsx, .xlsm) are present
    in any subfolder of the project. The check is case-insensitive, so it
    will catch files like .XLSX or .XLS.

    Raises
    ------
    pytest.fail
        If any Excel files are found, with details about the files and
        recommendation to use CSV format.

    Notes
    -----
    CSV is preferred over Excel because:
    - It's plain text and human-readable
    - Better version control capabilities
    - Simpler to process programmatically
    - More universal compatibility

    Every folder returned by ``get_all_folders`` is searched recursively, so a
    nested file is found once per ancestor folder. Matches are therefore
    collected in a set and reported once each, in sorted order. Files located
    directly in the working directory (not inside any subfolder) are not
    inspected.
    """
    excel_extensions = {'.xls', '.xlsx', '.xlsm'}

    # A set removes the duplicates produced by the overlapping recursive scans.
    unique_files = set()
    for folder in get_all_folders():
        unique_files.update(find_files_by_extension(folder, excel_extensions))
    excel_files = sorted(unique_files)

    if excel_files:
        files_str = '\n'.join(str(f) for f in excel_files)
        pytest.fail(
            f"Found Excel files:\n{files_str}\n\n"
            "Recommendation: Convert these files to CSV format for better "
            "Git compatibility and version control."
        )
bdestombe commented 4 days ago

Solved with #20 and #19