rust-lang / git2-rs

libgit2 bindings for Rust
https://docs.rs/git2
Apache License 2.0
1.64k stars 380 forks source link

Tree.walk() broken for subtrees with non-utf-8 names #1033

Open bsidhom opened 4 months ago

bsidhom commented 4 months ago

The problem is that the callback wrapper in the walk() implementation assumes utf-8 for the root name. However, this is not required by base Git or libgit2. Other git2-rs methods appear to correctly handle non-utf-8 names by allowing paths to be accessed as byte slices.

Here's a self-contained demonstration of the issue which also shows base git behavior:

Cargo.toml

[package]
name = "bad-encoding"
version = "0.1.0"
edition = "2021"

[dependencies]
anyhow = "1.0.80"
base64 = "0.21.7"
git2 = "0.18.2"
rand = "0.8.5"

src/main.rs:

use std::{
    path::{Path, PathBuf},
    process::Command,
};

use anyhow::{anyhow, Context, Result};
use git2::{
    ObjectType, Oid, Repository, RepositoryInitOptions, Signature, Tree, TreeBuilder,
    TreeWalkResult,
};
use rand::RngCore;

fn main() -> Result<()> {
    let mut rng = rand::thread_rng();
    let temp_dir = create_temp_dir(&mut rng)?;
    println!("Creating repo at {temp_dir:?}");
    let mut options = RepositoryInitOptions::new();
    // NOTE: We use a bare repo to avoid creating a working tree and
    // materializing any pathological file names. Git, however, supports
    // almost-arbitrary filenames internally (excluding embedded NUL and some
    // other special characters, depending on the mode). libgit2 also supports
    // arbitrary encoding with some exceptions:
    // https://github.com/libgit2/libgit2/blob/54bfab93bffa4f98ee37e3730c8ed4061a1bc5db/src/util/fs_path.c#L1579.
    options.bare(true).initial_head("main");
    let repo = Repository::init_opts(&temp_dir, &options)?;
    println!("Populating repo");
    let spec = vec![
        TreeSpec::blob(b"a.txt", b"Toplevel file. This is walkable because it's traversed _before_ the broken subtree."),
        TreeSpec::blob(b"b\xc0.txt", b"Toplevel file. This is walkable as above, but it also has a filename which is not valid utf-8."),
        // Note that the tree below (with an invalid utf-8 name) can be _listed_
        // but not _traversed_ by the current walk implementation.
        TreeSpec::tree(b"c\xc0", vec![
            TreeSpec::blob(b"nested.txt", b"Not walkable due to the parent directory name being non-utf-8 bytes. This will abort the walk."),
        ]),
        TreeSpec::blob(b"d.txt", b"Not walkable due to the broken tree above ending the walk."),
    ];
    populate_repo(&repo, &spec)?;
    println!("Repo populated");
    println!();
    println!("Walking via git2:");
    walk_repo(&repo)?;
    println!();
    println!("Listing committed files using git command:");
    list_repo(temp_dir.as_path())?;
    Ok(())
}

fn create_temp_dir(rng: impl RngCore) -> Result<PathBuf> {
    let mut rng = rng;
    let toplevel = std::env::temp_dir();
    let prefix = "git-repo";
    for _ in 0..1000 {
        let suffix = random_suffix(&mut rng);
        let path = Path::join(toplevel.as_path(), format!("{prefix}-{suffix}.git"));
        match std::fs::create_dir(path.as_path()) {
            Ok(()) => return Ok(path),
            Err(e) => match e.kind() {
                std::io::ErrorKind::AlreadyExists => (),
                _ => return Err(e.into()),
            },
        }
    }
    Err(anyhow!("could not allocate a unique random dir"))
}

fn random_suffix(rng: impl RngCore) -> String {
    use base64::engine::general_purpose::URL_SAFE_NO_PAD;
    use base64::Engine;
    let mut rng = rng;
    let mut buf = [0; 4];
    rng.fill_bytes(&mut buf);
    URL_SAFE_NO_PAD.encode(&buf)
}

fn populate_repo(repo: &Repository, spec: &[TreeSpec]) -> Result<()> {
    let mut root = repo.treebuilder(None)?;
    for item in spec {
        add_to_tree(repo, &mut root, item)?;
    }
    let tree_oid = root.write()?;
    let tree = repo.find_tree(tree_oid)?;
    let signature = Signature::now("foo", "baz@example.com")?;
    let message = "Commit to hold reference";
    repo.commit(Some("HEAD"), &signature, &signature, message, &tree, &[])?;
    Ok(())
}

fn add_to_tree(repo: &Repository, root: &mut TreeBuilder, tree_spec: &TreeSpec) -> Result<()> {
    match tree_spec {
        TreeSpec::Tree { name, children } => {
            let mut tree_builder = repo.treebuilder(None)?;
            for child in children {
                add_to_tree(repo, &mut tree_builder, child)?;
            }
            let oid = tree_builder.write()?;
            root.insert(name, oid, 0o040000)?;
        }
        TreeSpec::Blob { name, contents } => {
            let oid = repo.blob(contents)?;
            root.insert(name, oid, 0o100644)?;
        }
    }
    Ok(())
}

fn walk_repo(repo: &Repository) -> Result<()> {
    let mut revwalk = repo.revwalk()?;
    revwalk.push_head()?;
    for oid in revwalk {
        let oid = oid?;
        describe_commit(&repo, oid).with_context(|| format!("could not inspect commit {oid}"))?;
    }
    Ok(())
}

fn describe_commit(repo: &Repository, oid: Oid) -> Result<()> {
    let commit = repo.find_commit(oid)?;
    let summary = commit
        .summary()
        .ok_or_else(|| anyhow!("could not open commit summary"))
        .with_context(|| format!("reading oid {oid}"))?;
    println!("{oid}: {summary}");
    let tree = commit.tree()?;
    inspect_tree(repo, &tree)?;
    Ok(())
}

fn inspect_tree(repo: &Repository, tree: &Tree) -> Result<()> {
    // Note that the signature below _assumes_ that `root` can always be coerced
    // to `&str` (i.e., utf-8). Due to the implementation, in practice, the
    // result is that subtrees containing non-utf-8 encodings are always
    // silently skipped while walking. Note that basenames (`filename` below)
    // are correctly handled as bytes due to `TreeEntry`'s interface.
    tree.walk(git2::TreeWalkMode::PreOrder, |root, entry| {
        let filename = String::from_utf8_lossy(entry.name_bytes());
        let path = format!("{root}{filename}");
        if let Ok(object) = entry.to_object(repo) {
            match object.kind() {
                Some(ObjectType::Blob) => {
                    let blob = object.as_blob().unwrap();
                    if blob.is_binary() {
                        println!("  {path}: <binary>")
                    } else {
                        println!("  {path}:");
                        let contents = String::from_utf8_lossy(blob.content());
                        for line in contents.lines() {
                            println!("  > {line}");
                        }
                    }
                }
                Some(ObjectType::Tree) => {
                    println!("  {path}: <tree>");
                }
                None => {
                    println!("{path} had no object type");
                }
                _ => {
                    println!("{path} was an unexpected object type (not a blob or tree)");
                }
            }
            // Is it possible to return a result from the callback? It would be
            // much better ergonomically if we could.
            TreeWalkResult::Ok
        } else {
            println!("ERROR: could not get object for {path}");
            TreeWalkResult::Abort
        }
    })?;
    Ok(())
}

fn list_repo(path: &Path) -> Result<()> {
    let mut child = Command::new("git")
        .arg("-C")
        .arg(path)
        .arg("ls-tree")
        .arg("-r")
        .arg("main")
        .spawn()?;
    let result = child.wait()?;
    if result.success() {
        Ok(())
    } else {
        if let Some(code) = result.code() {
            Err(anyhow!("git failed with status code {code}"))
        } else {
            Err(anyhow!("git failed"))
        }
    }
}

enum TreeSpec {
    Tree {
        name: &'static [u8],
        children: Vec<TreeSpec>,
    },
    Blob {
        name: &'static [u8],
        contents: &'static [u8],
    },
}

impl TreeSpec {
    fn tree(name: &'static [u8], children: Vec<TreeSpec>) -> TreeSpec {
        TreeSpec::Tree { name, children }
    }
    fn blob(name: &'static [u8], contents: &'static [u8]) -> TreeSpec {
        TreeSpec::Blob { name, contents }
    }
}

Sample invocation:

> cargo run --release
    Finished release [optimized] target(s) in 0.23s
     Running `target/release/bad-encoding`
Creating repo at "/var/folders/0h/1lcyrkv11ynfjq5w7bt0sl_00000gn/T/git-repo-kDBu5A.git"
Populating repo
Repo populated

Walking via git2:
6cdc2a2f7544639226efe4e872ddd2ec42fb70a0: Commit to hold reference
  a.txt:
  > Toplevel file. This is walkable because it's traversed _before_ the broken subtree.
  b�.txt:
  > Toplevel file. This is walkable as above, but it also has a filename which is not valid utf-8.
  c�: <tree>

Listing committed files using git command:
100644 blob 596c677536cbdd42ed208abb4135afcb95bd967c    a.txt
100644 blob 330e18bbaa623998f17ce703cdcf7158915c6bc5    "b\300.txt"
100644 blob 5fd5bb9cddb4076e8786e90670ba46e621174482    "c\300/nested.txt"
100644 blob 07bd6651bc96c9554dab80f1f15b867d4091116a    d.txt

The example binary does not delete the bare git repo afterward so you can easily inspect it.

bsidhom commented 4 months ago

I'm not sure if it would be acceptable to make a breaking change to fix this or if this should go in as a new method, e.g., Tree::walk2().