mirage / ocaml-tar

Pure OCaml library to read and write tar files
ISC License
54 stars 34 forks source link

Some major changes #127

Closed reynir closed 9 months ago

reynir commented 1 year ago

This is a continuation of #119 and fixes #120, #125, #107 and probably also #71.

This is still work in progress.

What I'd like to do as well:

avsm commented 1 year ago

This all looks like it's heading in the right direction, and suitable for a major version number revision of the library. Do you want to break it up into smaller chunks @reynir or would you prefer a review once marked as completed?

hannesm commented 10 months ago

This looks good, what I wonder is whether read/write (i.e. really_read/really_write) should be able to return an error which is propagated to read/write?

reynir commented 10 months ago

I think this is ready for review now.

I agree that it is not very neat to mix result and exceptions from read and write operations. I think it is worth investigating. Maybe it's even worth restructuring it as an engine asking the caller to do the read and write operations?! Since this PR has been sitting for too long I think we should do it in another PR.

hannesm commented 10 months ago

Is there someone willing to review? Since I co-authored the commits yesterday (paired with @reynir), I think it makes sense that someone else does a review -- maybe @MisterDA or @avsm or @samoht?

I agree with @reynir that the error story (of a failing read/write) is best addressed in a separate PR, but maybe before a release. We (@reynir and myself) also worked on a patch for obuilder to use the API proposed here.

hannesm commented 10 months ago

The above mentioned patch is:

diff --git a/lib/tar_transfer.ml b/lib/tar_transfer.ml
index c16cbeb..c40303a 100644
--- a/lib/tar_transfer.ml
+++ b/lib/tar_transfer.ml
@@ -26,23 +26,14 @@ module Tar_lwt_unix = struct
      OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
   *)

-  module Writer = struct
-    type out_channel = Lwt_unix.file_descr
-    type 'a t = 'a Lwt.t
-    let really_write fd = Lwt_cstruct.(complete (write fd))
-  end
-
-  module HW = Tar.HeaderWriter(Lwt)(Writer)
-
   let write_block ?level (header: Tar.Header.t) (body: Lwt_unix.file_descr -> unit Lwt.t) (fd : Lwt_unix.file_descr) =
-    HW.write ?level header fd
-    >>= fun () ->
+    HeaderWriter.write ?level header fd >>= fun _ ->
     body fd >>= fun () ->
-    Writer.really_write fd (Tar.Header.zero_padding header)
+    really_write fd (Tar.Header.zero_padding header)

   let write_end (fd: Lwt_unix.file_descr) =
-    Writer.really_write fd Tar.Header.zero_block >>= fun () ->
-    Writer.really_write fd Tar.Header.zero_block
+    really_write fd Tar.Header.zero_block >>= fun () ->
+    really_write fd Tar.Header.zero_block
 end

 let copy_to ~dst src =
@@ -88,7 +79,7 @@ let copy_symlink ~src ~target ~dst ~to_untar ~user =
       ?user_id ?group_id ?uname ?gname
       dst 0L
   in
-  Tar_lwt_unix.write_block ~level hdr (fun _ -> Lwt.return_unit) to_untar
+  Tar_lwt_unix.HeaderWriter.write ~level hdr to_untar >|= ignore

 let rec copy_dir ~src_dir ~src ~dst ~(items:(Manifest.t list)) ~to_untar ~user =
   Log.debug(fun f -> f "Copy dir %S -> %S" src dst);
@@ -101,8 +92,9 @@ let rec copy_dir ~src_dir ~src ~dst ~(items:(Manifest.t list)) ~to_untar ~user =
         ?user_id ?group_id ?uname ?gname
         (dst ^ "/") 0L
     in
-    Tar_lwt_unix.write_block ~level hdr (fun _ -> Lwt.return_unit) to_untar
-  end >>= fun () -> send_dir ~src_dir ~dst ~to_untar ~user items
+    Tar_lwt_unix.HeaderWriter.write ~level hdr to_untar
+  end >>= fun _ ->
+  send_dir ~src_dir ~dst ~to_untar ~user items

 and send_dir ~src_dir ~dst ~to_untar ~user items =
   items |> Lwt_list.iter_s (function
@@ -141,6 +133,38 @@ let send_file ~src_dir ~src_manifest ~dst ~user ~to_untar =
   end >>= fun () ->
   Tar_lwt_unix.write_end to_untar

+let copy_n ifd ofd n =
+  let open Tar_lwt_unix in
+  let block_size = 32768 in
+  let buffer = Cstruct.create block_size in
+  let rec loop remaining =
+    if remaining = 0L then Lwt.return_unit else begin
+      let this = Int64.(to_int (min (of_int block_size) remaining)) in
+      let block = Cstruct.sub buffer 0 this in
+      really_read ifd block >>= fun () ->
+      really_write ofd block >>= fun () ->
+      loop (Int64.(sub remaining (of_int this)))
+    end in
+  loop n
+
+let tar_transform ?level f ifd ofd =
+  let open Tar_lwt_unix in
+  let rec loop global () = HeaderReader.read ~global ifd >>= function
+    | Error `Eof -> Lwt.return_unit
+    | Error e -> Log.err (fun m -> m "received error %a when reading" Tar.pp_error e); Lwt.return_unit
+    | Ok (header', global') ->
+      let header = f header' in
+      let body = fun _ -> copy_n ifd ofd header.Tar.Header.file_size in
+      (match global' with
+       | Some g when global <> global' ->
+         HeaderWriter.write_global_extended_header g ofd >|= ignore
+       | _ -> Lwt.return_unit) >>= fun () ->
+      write_block ?level header body ofd >>= fun () ->
+      skip ifd (Tar.Header.compute_zero_padding_length header') >>= fun () ->
+      loop global' () in
+  loop None () >>= fun () ->
+  write_end ofd
+
 let transform ~user fname hdr =
   (* Make a copy to erase unneeded data from the tar headers. *)
   let hdr' = Tar.Header.(make ~file_mode:hdr.file_mode ~mod_time:hdr.mod_time hdr.file_name hdr.file_size) in
@@ -192,7 +216,7 @@ and transform_files ~from_tar ~src_manifest ~dst_dir ~user ~to_untar =
     | exception Not_found -> Fmt.failwith "Could not find mapping for %s" file_name
     | file_name -> file_name
   in
-  Tar_lwt_unix.Archive.transform ~level (transform ~user fname) from_tar to_untar
+  tar_transform ~level (transform ~user fname) from_tar to_untar

 let transform_file ~from_tar ~src_manifest ~dst ~user ~to_untar =
   let dst = remove_leading_slashes dst in
@@ -211,7 +235,7 @@ let transform_file ~from_tar ~src_manifest ~dst ~user ~to_untar =
     | exception Not_found -> Fmt.failwith "Could not find mapping for %s" file_name
     | file_name -> file_name
   in
-  Tar_lwt_unix.Archive.transform ~level (fun hdr ->
+  tar_transform ~level (fun hdr ->
       let hdr' = transform ~user fname hdr in
       Log.debug (fun f -> f "Copying %s -> %s" hdr.Tar.Header.file_name hdr'.Tar.Header.file_name);
       hdr')

And as you can tell, the error handling is pretty poor (as far as I can tell, the error handling before wasn't very nice either). I really think we need a reasonable error story.

hannesm commented 10 months ago

I also propose to use lseek as skip, and have users that want to use non-seekable file descriptors with ocaml-tar implement their workarounds. (Instead of using read to support all the possible file descriptors - with a huge slowdown for 99,999% use cases.)

hannesm commented 9 months ago

Thanks for your review @kit-ty-kate. I pushed dd73851 which distinguishes `Eof from other `Fatal errors, as you proposed.

On your higher-level note, I agree that a nicer API would be great to have. I doubt the current error-choking interface (where read and write can never fail) is the path forward, and would like to address this in a separate PR before a next release.

reynir commented 9 months ago

I squash-merged the PR as I think it is good, and it is better to have it merged and do more changes in separate PRs than leave it open for any longer.

Thank you for your review @kit-ty-kate. I agree about `Eof (and thank you @hannesm for committing it).

I also agree about your second point. What I find challenging is that you may read a tar header and then decide to 1) read the file contents, or 2) skip the file contents (or 3) not do any further reading, I guess). There's an implicit assumption that when you read the next header the caller has already read or skipped the file contents and the NUL-byte padding. After discussing with @hannesm I'd like to explore a more IO-decoupled approach where hopefully it is easier to provide a sensible iterator interface. This would, as @hannesm mentions, make it easier to deal with IO operations that may raise exceptions (such as the ones from Unix).