greymd / teip

Masking tape to help commands "do one thing well"
MIT License
571 stars 19 forks source link

Refactoring for send_byps/send_keep #41

Closed greymd closed 1 year ago

greymd commented 1 year ago

Such the pattern is frequently observed on the code base.

https://github.com/greymd/teip/blob/master/src/procs.rs#L152-L165

        if is_in && !last_is_in {
            ch.send_keep(str_out.to_string())?;
            str_out.clear();
        } else if !is_in && last_is_in {
            ch.send_byps(str_in.to_string())?;
            str_in.clear();
        }
        last_is_in = is_in;
    }
    if last_is_in && !str_in.is_empty() {
        ch.send_byps(str_in)?;
    } else {
        ch.send_keep(str_out)?;
    }

This procedure can be unified to a single function.

greymd commented 1 year ago

I was working this issue on other branch but I noticed that this change degrades performance.

Prerequisites

Prepare a CSV file for benchmarking. https://github.com/greymd/teip/wiki/CSV-Benchmark

Benchmark

teip 2.2.0

$ file $(which teip)
/usr/bin/teip: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, BuildID[sha1]=e5ee576d461e79055f66d53ff6d55ff792606721, stripped

$ teip --version
teip 2.2.0

~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null  4.38s user 0.42s system 159% cpu 3.000 total

~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null  4.45s user 0.37s system 161% cpu 2.978 total

~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null  4.44s user 0.35s system 161% cpu 2.966 total

Refactored teip 2.2.0

$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time ./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv >   5.23s user 0.31s system 117% cpu 4.732 total

~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time ./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv >   5.62s user 0.24s system 118% cpu 4.952 total

~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time ./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv >   5.18s user 0.28s system 117% cpu 4.627 total

How refactored

Update char_proc in proc.rs

/// Bypassing character range ( -c )
pub fn char_proc(
    ch: &mut PipeIntercepter,
    line: &Vec<u8>,
    ranges: &Vec<list::ranges::Range>,
) -> Result<(), errors::ChunkSendError> {
    let line = String::from_utf8_lossy(&line).to_string();
    let cs = line.chars();
    let mut ri = 0; // range index
    // Merge consequent characters' range to execute commands as few times as possible.
    for (i, c) in cs.enumerate() {
        // If the current position is out of the range, go to the next range.
        if ranges[ri].high < (i + 1) && (ri + 1) < ranges.len() {
            ri += 1;
        }
        if ranges[ri].low <= (i + 1) && (i + 1) <= ranges[ri].high {
            ch.buf_send_byps(c.to_string())?
        } else {
            ch.buf_send_keep(c.to_string())?
        }
    }
    ch.flush_byps()?;
    ch.flush_keep()?;
    Ok(())
}

Add following statements to chunk.rs

#[derive(Debug)]
pub enum ChunkGroup {
    Keep,   // a string under masking tape. Printed as is.
    Hole,   // A hole on the masking tape. The string in the hole being processed other thread.
}

Add following statements to pipeintercepter.rs

pub struct ChunkBuf {
    keep_buf: String,
    byps_buf: String,
    last_target: Option<ChunkGroup>,
}

impl ChunkBuf {
    fn new() -> ChunkBuf {
        ChunkBuf {
            keep_buf: String::with_capacity(DEFAULT_CAP),
            byps_buf: String::with_capacity(DEFAULT_CAP),
            last_target: None,
        }
    }
}

/// struct for bypassing input and its interface
pub struct PipeIntercepter {
    tx: Sender<Chunk>,
    pipe_writer: BufWriter<Box<dyn Write + Send + 'static>>, // Not used when -s
    handler: Option<JoinHandle<()>>,                         // "option dance"
    line_end: u8,
    solid: bool,
    dryrun: bool,
    chunk_buf: ChunkBuf,
}

Add following statements to pipeintercepter.rs under PipeIntercepter struct

    /// Used as send_keep() but it prevents sending chunk immediately.
    /// Keep the chunk in the buffer until the next Hole is found.
    /// Actually, send_keep() is not called directly but called by buf_send_byps().
    pub fn buf_send_keep(&mut self, msg: String) -> Result<(), errors::ChunkSendError> {
        // append msg to keep_buf
        self.chunk_buf.keep_buf.push_str(&msg);
        match self.chunk_buf.last_target {
            Some(ChunkGroup::Keep) => {
                return Ok(());
            }
            Some(ChunkGroup::Hole) => {
                match self.send_byps(self.chunk_buf.byps_buf.clone()) {
                    Ok(_) => {
                        // clear keep_buf
                        self.chunk_buf.byps_buf.clear();
                        self.chunk_buf.last_target = Some(ChunkGroup::Keep);
                        return Ok(());
                    }
                    Err(e) => return Err(e),
                }
            }
            // Initialize
            None => {
                self.chunk_buf.last_target = Some(ChunkGroup::Keep);
                return Ok(());
            }
        }
    }

    /// Used as send_byps() but it prevents sending chunk immediately.
    /// Keep the chunk in the buffer until the next Keep is found.
    /// Actually, send_byps() is not called directly but called by buf_send_keep().
    pub fn buf_send_byps(&mut self, msg: String) -> Result<(), errors::ChunkSendError> {
        // append msg to byps_buf
        self.chunk_buf.byps_buf.push_str(&msg);
        match self.chunk_buf.last_target {
            Some(ChunkGroup::Keep) => {
                match self.send_keep(self.chunk_buf.keep_buf.clone()) {
                    Ok(_) => {
                        // clear keep_buf
                        self.chunk_buf.keep_buf.clear();
                        self.chunk_buf.last_target = Some(ChunkGroup::Hole);
                        return Ok(());
                    }
                    Err(e) => return Err(e),
                }
            }
            Some(ChunkGroup::Hole) => {
                return Ok(());
            }
            // Initialize
            None => {
                self.chunk_buf.last_target = Some(ChunkGroup::Hole);
                return Ok(());
            }
        }
    }

    // send remaining chunks to be bypassed in the buffer
    pub fn flush_byps(&mut self) -> Result<(), errors::ChunkSendError> {
        if !self.chunk_buf.byps_buf.is_empty() {
            match self.send_byps(self.chunk_buf.byps_buf.clone()) {
                Ok(_) => {
                    // clear keep_buf
                    self.chunk_buf.byps_buf.clear();
                    self.chunk_buf.last_target = None;
                    return Ok(());
                }
                Err(e) => return Err(e),
            }
        }
        return Ok(());
    }

    pub fn flush_keep(&mut self) -> Result<(), errors::ChunkSendError> {
        if !self.chunk_buf.keep_buf.is_empty() {
            match self.send_keep(self.chunk_buf.keep_buf.clone()) {
                Ok(_) => {
                    // clear keep_buf
                    self.chunk_buf.keep_buf.clear();
                    self.chunk_buf.last_target = None;
                    return Ok(());
                }
                Err(e) => return Err(e),
            }
        }
        return Ok(());
    }
greymd commented 1 year ago

teip does not sacrifice performance for better maintainability. Therefore, we have decided not to address this Issue.