rwf2 / multer

An async parser for multipart/form-data content-type in Rust
MIT License
155 stars 35 forks source link

Remove `format!` allocations during parsing #36

Open SergioBenitez opened 3 years ago

SergioBenitez commented 3 years ago

At present, the parser uses format! to generate strings during parsing. These should be removed.

https://github.com/rousan/multer-rs/blob/fe853d64f3bb7763866bdaa90c617a11a6bde505/src/multipart.rs#L255

https://github.com/rousan/multer-rs/blob/fe853d64f3bb7763866bdaa90c617a11a6bde505/src/multipart.rs#L312

https://github.com/rousan/multer-rs/blob/fe853d64f3bb7763866bdaa90c617a11a6bde505/src/buffer.rs#L108

There are a couple of approaches:

  1. Change the scanning code so that the parts of the joined string are searched for individually in succession. IE, find BOUNDARY_EXT and check if the next bytes are boundary.
  2. Cache the formatted strings in something like Storage. This is probably the easier approach.
jaysonsantos commented 1 year ago

This is an option that passes the current tests because they don't seem to cover edge cases:

diff --git a/src/buffer.rs b/src/buffer.rs
index 23f863d..23b0962 100644
--- a/src/buffer.rs
+++ b/src/buffer.rs
@@ -105,56 +105,64 @@ impl<'r> StreamBuffer<'r> {
             return Ok(None);
         }

-        let boundary_deriv = format!("{}{}{}", constants::CRLF, constants::BOUNDARY_EXT, boundary);
-        let b_len = boundary_deriv.len();
+        let mut consumed = 0;
+        let mut position = None;
+
+        for needle in [
+            constants::CRLF.as_bytes(),
+            constants::BOUNDARY_EXT.as_bytes(),
+            boundary.as_bytes(),
+        ] {
+            let window = &self.buf[consumed..];
+            consumed += if let Some(pos) = memchr::memmem::find(window, needle) {
+                position = Some(pos);
+                pos
+            } else {
+                position = None;
+                break;
+            };
+        }

-        match memchr::memmem::find(&self.buf, boundary_deriv.as_bytes()) {
-            Some(idx) => {
-                log::trace!("new field found at {}", idx);
-                let bytes = self.buf.split_to(idx).freeze();
+        match position {
+            Some(_) => {
+                log::trace!("new field found at {}", consumed);
+                let bytes = self.buf.split_to(consumed - 4).freeze();

                 // discard \r\n.
                 self.buf.advance(constants::CRLF.len());

-                Ok(Some((true, bytes)))
+                return Ok(Some((true, bytes)));
             }
             None if self.eof => {
                 log::trace!("no new field found: EOF. terminating");
-                Err(crate::Error::IncompleteFieldData {
+                return Err(crate::Error::IncompleteFieldData {
                     field_name: field_name.map(|s| s.to_owned()),
-                })
-            }
-            None => {
-                let buf_len = self.buf.len();
-                let rem_boundary_part_max_len = b_len - 1;
-                let rem_boundary_part_idx = if buf_len >= rem_boundary_part_max_len {
-                    buf_len - rem_boundary_part_max_len
-                } else {
-                    0
-                };
-
-                log::trace!("no new field found, not EOF, checking close");
-                let bytes = &self.buf[rem_boundary_part_idx..];
-                match memchr::memmem::rfind(bytes, constants::CR.as_bytes()) {
-                    Some(rel_idx) => {
-                        let idx = rel_idx + rem_boundary_part_idx;
-
-                        match memchr::memmem::find(boundary_deriv.as_bytes(), &self.buf[idx..]) {
-                            Some(_) => {
-                                let bytes = self.buf.split_to(idx).freeze();
-
-                                match bytes.is_empty() {
-                                    true => Ok(None),
-                                    false => Ok(Some((false, bytes))),
-                                }
-                            }
-                            None => Ok(Some((false, self.read_full_buf()))),
-                        }
-                    }
-                    None => Ok(Some((false, self.read_full_buf()))),
-                }
+                });
             }
-        }
+            _ => {}
+        };
+
+        log::trace!("no new field found, not EOF, checking close");
+        Ok(Some((false, self.read_full_buf())))
+        // These are not covered by tests
+        // match memchr::memmem::rfind(&self.buf, constants::CR.as_bytes()) {
+        //     Some(rel_idx) => {
+        //         let idx = rel_idx + rem_boundary_part_idx;
+        //
+        //         match memchr::memmem::find(boundary_deriv.as_bytes(),
+        // &self.buf[idx..]) {             Some(_) => {
+        //                 let bytes = self.buf.split_to(idx).freeze();
+        //
+        //                 match bytes.is_empty() {
+        //                     true => Ok(None),
+        //                     false => Ok(Some((false, bytes))),
+        //                 }
+        //             }
+        //             None => Ok(Some((false, self.read_full_buf()))),
+        //         }
+        //     }
+        //     None => Ok(Some((false, self.read_full_buf()))),
+        // }
     }

     pub fn read_full_buf(&mut self) -> Bytes {
SergioBenitez commented 1 year ago

@jaysonsantos Did you see https://github.com/rousan/multer-rs/pull/47#discussion_r1092389720?