Closed reynir closed 9 months ago
This all looks like it's heading in the right direction, and suitable for a major version number revision of the library. Do you want to break it up into smaller chunks @reynir or would you prefer a review once marked as completed?
This looks good, what I wonder is whether read/write (i.e. really_read/really_write) should be able to return an error which is propagated to read/write?
I think this is ready for review now.
I agree that it is not very neat to mix result and exceptions from read and write operations. I think it is worth investigating. Maybe it's even worth restructuring it as an engine asking the caller to do the read and write operations?! Since this PR has been sitting for too long I think we should do it in another PR.
Is there someone willing to review? Since I co-authored the commits yesterday (paired with @reynir), I think it makes sense that someone else does a review -- maybe @MisterDA or @avsm or @samoht?
I agree with @reynir that the error story (of a failing read/write) is best addressed in a separate PR, but maybe before a release. We (@reynir and myself) also worked on a patch for obuilder
to use the API proposed here.
The above mentioned patch is:
diff --git a/lib/tar_transfer.ml b/lib/tar_transfer.ml
index c16cbeb..c40303a 100644
--- a/lib/tar_transfer.ml
+++ b/lib/tar_transfer.ml
@@ -26,23 +26,14 @@ module Tar_lwt_unix = struct
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*)
- module Writer = struct
- type out_channel = Lwt_unix.file_descr
- type 'a t = 'a Lwt.t
- let really_write fd = Lwt_cstruct.(complete (write fd))
- end
-
- module HW = Tar.HeaderWriter(Lwt)(Writer)
-
let write_block ?level (header: Tar.Header.t) (body: Lwt_unix.file_descr -> unit Lwt.t) (fd : Lwt_unix.file_descr) =
- HW.write ?level header fd
- >>= fun () ->
+ HeaderWriter.write ?level header fd >>= fun _ ->
body fd >>= fun () ->
- Writer.really_write fd (Tar.Header.zero_padding header)
+ really_write fd (Tar.Header.zero_padding header)
let write_end (fd: Lwt_unix.file_descr) =
- Writer.really_write fd Tar.Header.zero_block >>= fun () ->
- Writer.really_write fd Tar.Header.zero_block
+ really_write fd Tar.Header.zero_block >>= fun () ->
+ really_write fd Tar.Header.zero_block
end
let copy_to ~dst src =
@@ -88,7 +79,7 @@ let copy_symlink ~src ~target ~dst ~to_untar ~user =
?user_id ?group_id ?uname ?gname
dst 0L
in
- Tar_lwt_unix.write_block ~level hdr (fun _ -> Lwt.return_unit) to_untar
+ Tar_lwt_unix.HeaderWriter.write ~level hdr to_untar >|= ignore
let rec copy_dir ~src_dir ~src ~dst ~(items:(Manifest.t list)) ~to_untar ~user =
Log.debug(fun f -> f "Copy dir %S -> %S" src dst);
@@ -101,8 +92,9 @@ let rec copy_dir ~src_dir ~src ~dst ~(items:(Manifest.t list)) ~to_untar ~user =
?user_id ?group_id ?uname ?gname
(dst ^ "/") 0L
in
- Tar_lwt_unix.write_block ~level hdr (fun _ -> Lwt.return_unit) to_untar
- end >>= fun () -> send_dir ~src_dir ~dst ~to_untar ~user items
+ Tar_lwt_unix.HeaderWriter.write ~level hdr to_untar
+ end >>= fun _ ->
+ send_dir ~src_dir ~dst ~to_untar ~user items
and send_dir ~src_dir ~dst ~to_untar ~user items =
items |> Lwt_list.iter_s (function
@@ -141,6 +133,38 @@ let send_file ~src_dir ~src_manifest ~dst ~user ~to_untar =
end >>= fun () ->
Tar_lwt_unix.write_end to_untar
+let copy_n ifd ofd n =
+ let open Tar_lwt_unix in
+ let block_size = 32768 in
+ let buffer = Cstruct.create block_size in
+ let rec loop remaining =
+ if remaining = 0L then Lwt.return_unit else begin
+ let this = Int64.(to_int (min (of_int block_size) remaining)) in
+ let block = Cstruct.sub buffer 0 this in
+ really_read ifd block >>= fun () ->
+ really_write ofd block >>= fun () ->
+ loop (Int64.(sub remaining (of_int this)))
+ end in
+ loop n
+
+let tar_transform ?level f ifd ofd =
+ let open Tar_lwt_unix in
+ let rec loop global () = HeaderReader.read ~global ifd >>= function
+ | Error `Eof -> Lwt.return_unit
+ | Error e -> Log.err (fun m -> m "received error %a when reading" Tar.pp_error e); Lwt.return_unit
+ | Ok (header', global') ->
+ let header = f header' in
+ let body = fun _ -> copy_n ifd ofd header.Tar.Header.file_size in
+ (match global' with
+ | Some g when global <> global' ->
+ HeaderWriter.write_global_extended_header g ofd >|= ignore
+ | _ -> Lwt.return_unit) >>= fun () ->
+ write_block ?level header body ofd >>= fun () ->
+ skip ifd (Tar.Header.compute_zero_padding_length header') >>= fun () ->
+ loop global' () in
+ loop None () >>= fun () ->
+ write_end ofd
+
let transform ~user fname hdr =
(* Make a copy to erase unneeded data from the tar headers. *)
let hdr' = Tar.Header.(make ~file_mode:hdr.file_mode ~mod_time:hdr.mod_time hdr.file_name hdr.file_size) in
@@ -192,7 +216,7 @@ and transform_files ~from_tar ~src_manifest ~dst_dir ~user ~to_untar =
| exception Not_found -> Fmt.failwith "Could not find mapping for %s" file_name
| file_name -> file_name
in
- Tar_lwt_unix.Archive.transform ~level (transform ~user fname) from_tar to_untar
+ tar_transform ~level (transform ~user fname) from_tar to_untar
let transform_file ~from_tar ~src_manifest ~dst ~user ~to_untar =
let dst = remove_leading_slashes dst in
@@ -211,7 +235,7 @@ let transform_file ~from_tar ~src_manifest ~dst ~user ~to_untar =
| exception Not_found -> Fmt.failwith "Could not find mapping for %s" file_name
| file_name -> file_name
in
- Tar_lwt_unix.Archive.transform ~level (fun hdr ->
+ tar_transform ~level (fun hdr ->
let hdr' = transform ~user fname hdr in
Log.debug (fun f -> f "Copying %s -> %s" hdr.Tar.Header.file_name hdr'.Tar.Header.file_name);
hdr')
And as you can tell, the error handling is pretty poor (as far as I can tell, the error handling before wasn't very nice either). I really think we need a reasonable error story.
I also propose to use lseek
as skip
, and have users that want to use non-seekable file descriptors with ocaml-tar implement their workarounds. (Instead of using read
to support all the possible file descriptors - with a huge slowdown for 99,999% use cases.)
Thanks for your review @kit-ty-kate. I pushed dd73851 which distinguishes `Eof
from other `Fatal
errors, as you proposed.
On your higher-level note, I agree that a nicer API would be great to have. I doubt the current error-choking interface (where read
and write
can never fail) is the path forward, and would like to address this in a separate PR before a next release.
I squash-merged the PR as I think it is good, and it is better to have it merged and do more changes in separate PRs than leave it open for any longer.
Thank you for your review @kit-ty-kate. I agree about `Eof
(and thank you @hannesm for committing it).
I also agree about your second point. What I find challenging is that you may read a tar header and then decide to 1) read the file contents, or 2) skip the file contents (or 3) not do any further reading, I guess). There's an implicit assumption that when you read the next header the caller has already read or skipped the file contents and the NUL-byte padding. After discussing with @hannesm I'd like to explore a more IO-decoupled approach where hopefully it is easier to provide a sensible iterator interface. This would, as @hannesm mentions, make it easier to deal with IO operations that may raise exceptions (such as the ones from Unix).
This is a continuation of #119 and fixes #120, #125, #107 and probably also #71.
Tar.HEADERREADER
andTar.HEADERWRITER
module types, and renameTar.{READER,WRITER}.t
toio
.write_global
function for writing a globalTar.Header.Extended.t
. This allows writing an archive with a PAX comment and nothing else.This is still work in progress.
What I'd like to do as well: