simonw / files-to-prompt

Concatenate a directory full of files into a single prompt for use with LLMs
Apache License 2.0
244 stars 17 forks source link

I've made a similar tool at https://ib.bsb.br/cat-files #10

Open marioseixas opened 3 months ago

marioseixas commented 3 months ago

#!/bin/bash

# Print the usage summary to stdout.
# The usage line embeds $0, i.e. the name the script was invoked with; the two
# default directories are shown as literal "$HOME/..." text, not expanded.
display_help() {
    printf '%s\n' \
        "Usage: $0 [search_directory] [destination_directory]" \
        "Extracts text from files in the specified search directory, concatenates them," \
        "and saves the result in the specified destination directory." \
        "If no arguments are provided, default directories are used:" \
        '  - Search Directory: $HOME/Documents' \
        '  - Destination Directory: $HOME/Desktop'
}

# Show the usage text and stop early when the first argument is a help flag.
case "$1" in
    -h|--help)
        display_help
        exit 0
        ;;
esac

# Resolve the working paths. The first and second positional arguments
# override the default search and destination directories; every output path
# is derived from the destination directory.
search_directory=${1:-"$HOME/Documents"}
destination_directory=${2:-"$HOME/Desktop"}
final_file="$destination_directory/final_concatenated_file.txt"
unsupported_file="$destination_directory/concatenated_contents_unsupported.txt"
temp_file_list="$destination_directory/temp_file_list.txt"

# MIME types accepted by the checks in is_output_textual and process_file,
# which compare each entry against the output of `file -b --mime-type`.
supported_types=(
    "text/plain"
    "text/markdown"
    "text/x-log"
    "text/x-srt"
    "text/x-microdvd"
    "text/csv"
    "text/xml"
    "application/json"
    "application/mbox"
    "application/vnd.ms-word"
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    "application/vnd.oasis.opendocument.text"
    "application/vnd.oasis.opendocument.spreadsheet"
    "application/vnd.oasis.opendocument.presentation"
    "application/pdf"
    "application/rtf"
    "text/html"
    "application/x-python"
    "application/java-archive"
    "text/javascript"
    "text/x-c++src"
    "text/x-chdr"
    "application/x-zip-compressed"
    "application/x-rar-compressed"
    "application/x-tar"
    "application/gzip"
    "application/vnd.ms-powerpoint"
    "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    "text/x-tex"
)

# Decide whether a file looks like text.
#
# A file is accepted when it contains no NUL byte and the printable strings
# extracted by `strings` amount to at least half of its size. Anything that
# fails this heuristic is still accepted if its MIME type is listed in
# supported_types.
#
# Globals:   supported_types (read)
# Arguments: $1 - path of the file to inspect
# Returns:   0 when the file is considered textual, 1 otherwise
is_output_textual() {
    local file="$1"
    local strings_output strings_length file_size mime_type supported_type

    # bash cannot carry a NUL byte in an argument: $'\0' expands to the empty
    # string, and an empty grep pattern matches every line. Detect NULs by
    # checking whether stripping them changes the file instead.
    if tr -d '\000' < "$file" | cmp -s - "$file"; then
        # No NUL bytes: compare the amount of printable text with the size.
        strings_output=$(strings "$file")
        strings_length=${#strings_output}
        # wc -c is used instead of GNU-only `stat -c%s`.
        file_size=$(wc -c < "$file")
        if (( strings_length >= file_size / 2 )); then
            return 0
        fi
    fi

    # Heuristic failed: fall back to the MIME type reported by file(1).
    mime_type=$(file -b --mime-type "$file")
    for supported_type in "${supported_types[@]}"; do
        if [[ "$mime_type" == "$supported_type" ]]; then
            return 0
        fi
    done

    return 1
}

# Turn one file into text, or fall back to a raw copy for unknown extensions.
#
# Known extensions are handed to convert_to_text and the result is vetted by
# is_output_textual; the caller is expected to append the result itself.
# Unknown extensions are copied verbatim, scored by three checks (NUL bytes,
# share of printable strings, MIME type) and appended right here to either
# $final_file or $unsupported_file, after which the copy is removed.
#
# Globals:   supported_types, final_file, unsupported_file (read)
# Arguments: $1 - source file
#            $2 - path where the extracted text is written
# Returns:   0 only when $2 holds text the caller should append; 1 when the
#            file was skipped, failed to convert, or was already appended by
#            the fallback branch.
process_file() {
    local file="$1"
    local text_file_path="$2"
    # Everything below is scratch state; keep it out of the global scope.
    local failed_checks strings_output strings_length file_size
    local mime_type candidate_type supported_type target_file

    if [[ ! -f "$file" ]]; then
        echo "Warning: '$file' is not a regular file. Skipping." >&2
        return 1
    fi

    case "${file##*.}" in
        pdf|docx|html|htm|srt|sub|log|tex|latex|doc|ppt|pptx|zip|rar|tar|gz|odt|ods|odp|rtf|csv|xml|json|py|java|js|cpp|h|md|markdown|txt|eml|mbox)
            echo "Processing file: $file"
            # Convert file to text and check if conversion is successful
            if ! convert_to_text "$file" "$text_file_path"; then
                echo "Error: Failed to convert $file" >&2
                return 1
            fi
            ;;
        *)
            # Unsupported file types are processed using 'cat' command
            echo "Warning: Unsupported file type '${file##*.}'. Processing using 'cat' command." >&2
            cat "$file" > "$text_file_path" 2>/dev/null

            failed_checks=0

            # Check 1: NUL bytes. bash cannot pass a NUL as a grep pattern
            # ($'\0' is the empty string and matches every line), so compare
            # the file with a NUL-stripped copy of itself instead.
            if ! tr -d '\000' < "$text_file_path" | cmp -s - "$text_file_path"; then
                failed_checks=$((failed_checks + 1))
            fi

            # Check 2: printable strings must make up at least half the size.
            strings_output=$(strings "$text_file_path")
            strings_length=${#strings_output}
            file_size=$(wc -c < "$text_file_path")
            if (( strings_length < file_size / 2 )); then
                failed_checks=$((failed_checks + 1))
            fi

            # Check 3: the MIME type must be one of supported_types.
            mime_type=$(file -b --mime-type "$text_file_path")
            supported_type=0
            for candidate_type in "${supported_types[@]}"; do
                if [[ "$mime_type" == "$candidate_type" ]]; then
                    supported_type=1
                    break
                fi
            done
            if (( supported_type == 0 )); then
                failed_checks=$((failed_checks + 1))
            fi

            # Only a file that fails all three checks is diverted.
            if (( failed_checks > 2 )); then
                echo "Warning: Output of $file failed more than two checks. Appending to unsupported file." >&2
                target_file=$unsupported_file
            else
                echo "Output of $file passed the checks. Appending to final file."
                target_file=$final_file
            fi

            {
                echo "<!-- $file -->:"
                echo '```'
                cat "$text_file_path"
                echo '```'
            } >> "$target_file"
            rm -f "$text_file_path"

            # Non-zero on purpose: the content has already been appended, so
            # the caller must not append it a second time.
            return 1
            ;;
    esac

    if ! is_output_textual "$text_file_path"; then
        echo "Warning: Output of $file is not textual. Skipping." >&2
        rm -f "$text_file_path"
        return 1
    fi

    echo "File $file processed successfully"
    return 0
}

# Convert a file to plain text with a tool chosen by its extension.
#
# Arguments: $1 - source file (the text after its last "." selects the tool)
#            $2 - path the text output is written to
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 when the selected tool succeeded; 1 when it failed or when no
#            converter exists for the extension (previously the latter was
#            reported as a success although nothing had been written).
convert_to_text() {
    local file="$1"
    local text_file_path="$2"
    # Status of the converter that ran, recorded where the command executes
    # rather than read from $? after the case statement.
    local status=0

    case "${file##*.}" in
        pdf)
            echo "Converting $file using pdftotext..."
            pdftotext "$file" "$text_file_path" 2>/dev/null || status=$?
            ;;
        docx|odt|ods|odp)
            echo "Converting $file using pandoc..."
            pandoc "$file" -t plain -o "$text_file_path" 2>/dev/null || status=$?
            ;;
        html|htm)
            echo "Converting $file using lynx..."
            lynx -dump -nolist -assume_charset UTF-8 -display_charset UTF-8 "$file" > "$text_file_path" 2>/dev/null || status=$?
            ;;
        srt|sub|log|tex|latex|py|java|js|cpp|h|md|markdown|txt|eml|mbox)
            echo "Copying $file..."
            cat "$file" > "$text_file_path" 2>/dev/null || status=$?
            ;;
        doc)
            echo "Converting $file using antiword..."
            antiword "$file" > "$text_file_path" 2>/dev/null || status=$?
            ;;
        ppt|pptx)
            echo "Converting $file using catppt..."
            catppt "$file" > "$text_file_path" 2>/dev/null || status=$?
            ;;
        zip|rar|tar|gz)
            echo "Extracting and processing $file..."
            extract_and_process_archive "$file" "$text_file_path" || status=$?
            ;;
        rtf)
            echo "Converting $file using unrtf..."
            unrtf --text "$file" > "$text_file_path" 2>/dev/null || status=$?
            ;;
        csv)
            echo "Processing $file using awk..."
            awk -F, '{print}' "$file" > "$text_file_path" || status=$?
            ;;
        xml)
            echo "Processing $file using xmllint..."
            xmllint --xpath "//text()" "$file" > "$text_file_path" 2>/dev/null || status=$?
            ;;
        json)
            echo "Converting $file using jq..."
            jq -r '.' "$file" > "$text_file_path" 2>/dev/null || status=$?
            ;;
        *)
            # No converter ran, so there is no output to report success for.
            echo "Error: No converter available for $file" >&2
            return 1
            ;;
    esac

    if (( status != 0 )); then
        echo "Error: Failed to convert $file" >&2
        return 1
    fi

    echo "Conversion of $file completed successfully"
    return 0
}

# Unpack an archive into a temporary directory and gather the text of every
# file it contains.
#
# Each extracted file is run through process_file with its own scratch output
# and, when process_file succeeds, appended to $2. (Writing every member
# straight to $2 made each one overwrite the previous, so only the last
# member's text survived.)
#
# Arguments: $1 - archive (.zip, .rar, .tar or .gz, chosen by extension)
#            $2 - path that receives the combined text of the archive members
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 when extraction succeeded, 1 when it failed or the extension
#            is not a handled archive type
extract_and_process_archive() {
    local file="$1"
    local text_file_path="$2"
    local temp_dir nested_output nested_file
    local status=0

    # Declared separately from the assignment so a mktemp failure is seen.
    if ! temp_dir=$(mktemp -d); then
        echo "Error: Failed to extract archive '$file'" >&2
        return 1
    fi
    # Kept outside temp_dir so the find below never lists it as a member.
    if ! nested_output=$(mktemp); then
        echo "Error: Failed to extract archive '$file'" >&2
        rm -rf -- "$temp_dir"
        return 1
    fi

    case "${file##*.}" in
        zip)
            echo "Extracting $file using unzip..."
            unzip -q "$file" -d "$temp_dir" || status=$?
            ;;
        rar)
            echo "Extracting $file using unrar..."
            # NOTE(review): unrar appears to need a trailing slash to treat
            # the last argument as a destination directory — confirm.
            unrar x "$file" "$temp_dir/" || status=$?
            ;;
        tar)
            echo "Extracting $file using tar..."
            tar -xf "$file" -C "$temp_dir" || status=$?
            ;;
        gz)
            echo "Extracting $file using gunzip..."
            gunzip -c "$file" > "$temp_dir/$(basename "$file" .gz)" || status=$?
            ;;
        *)
            # Nothing was extracted; do not report success.
            status=1
            ;;
    esac

    if (( status != 0 )); then
        echo "Error: Failed to extract archive '$file'" >&2
        rm -rf -- "$temp_dir"
        rm -f -- "$nested_output"
        return 1
    fi

    echo "Extraction of $file completed successfully"

    # Start from an empty result so the caller always finds a file at $2.
    : > "$text_file_path"

    # The member list is read on fd 3 so that tools started by process_file
    # cannot consume it through stdin.
    while IFS= read -r -d '' nested_file <&3; do
        echo "Processing nested file: $nested_file"
        if process_file "$nested_file" "$nested_output"; then
            cat "$nested_output" >> "$text_file_path"
        fi
    done 3< <(find "$temp_dir" -type f -print0)

    rm -rf -- "$temp_dir"
    rm -f -- "$nested_output"
    return 0
}

# Append one extracted text file to $final_file as a fenced section headed by
# an HTML comment naming the original file.
#
# Globals:   final_file (appended to)
# Arguments: $1 - path of the extracted text
#            $2 - original file name, used only in the section header
# Returns:   0 on success; 1 when $1 is not a readable regular file or any
#            write fails. Nothing is appended when $1 cannot be read.
concatenate_files() {
    local text_file_path="$1"
    local file="$2"

    # Checked up front so a missing input cannot leave a header and an empty
    # fenced block behind in the final file.
    if [[ ! -f "$text_file_path" || ! -r "$text_file_path" ]]; then
        echo "Error: Failed to concatenate $text_file_path" >&2
        return 1
    fi

    # Chain the writes so the status covers every step, not only the last echo.
    if ! {
        echo "<!-- $file -->:" &&
        echo '```' &&
        cat "$text_file_path" &&
        echo '```'
    } >> "$final_file"; then
        echo "Error: Failed to concatenate $text_file_path" >&2
        return 1
    fi

    echo "Concatenation of $text_file_path completed successfully"
    return 0
}

# Delete the intermediate *.txt files left directly inside the destination
# directory, keeping the two result files.
#
# The search is limited to the top level of the directory; recursing would
# also delete unrelated *.txt files stored in its subdirectories.
# NOTE(review): this still removes any other *.txt file that sits directly in
# the destination directory, whether or not this script created it.
#
# Globals: destination_directory, final_file, unsupported_file (read)
cleanup_temp_files() {
    find "$destination_directory" -maxdepth 1 -type f -name "*.txt" \
        ! -name "$(basename "$final_file")" \
        ! -name "$(basename "$unsupported_file")" \
        -delete
    echo "Cleanup of temporary files completed successfully"
}

# Main script execution starts here
mkdir -p "$search_directory" "$destination_directory"
> "$final_file"
> "$unsupported_file"

# Intermediate text files live in a private scratch directory instead of the
# destination directory. Nothing then has to be cleaned up by deleting *.txt
# files from the user's destination folder, which could remove files this
# script never created. The trap removes the scratch directory on every exit.
work_dir=$(mktemp -d) || {
    echo "Error: Failed to create a temporary working directory" >&2
    exit 1
}
trap 'rm -rf -- "$work_dir"' EXIT

# The file list is NUL-delimited so any file name is handled, and it is read
# on fd 3 so that converters started inside the loop cannot consume it through
# stdin.
file_index=0
while IFS= read -r -d '' file <&3; do
    # When the destination lies inside the search directory the result files
    # show up in the listing; feeding them back in would duplicate the output.
    if [[ "$file" -ef "$final_file" || "$file" -ef "$unsupported_file" ]]; then
        continue
    fi

    # A counter gives short, collision-free scratch names regardless of how
    # long the source path is.
    file_index=$((file_index + 1))
    text_file_path="${work_dir}/${file_index}.txt"
    if process_file "$file" "$text_file_path"; then
        concatenate_files "$text_file_path" "$file"
    fi
    rm -f -- "$text_file_path"
done 3< <(find "$search_directory" -type f -print0)

echo "Process completed. All files have been processed and concatenated into ${final_file}."
echo "Unsupported file types have been processed using 'cat' command and concatenated into ${unsupported_file}."