Closed greymd closed 1 year ago
I was working this issue on other branch but I noticed that this change degrades performance.
Prepare a CSV file for benchmarking. https://github.com/greymd/teip/wiki/CSV-Benchmark
$ file $(which teip)
/usr/bin/teip: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, BuildID[sha1]=e5ee576d461e79055f66d53ff6d55ff792606721, stripped
$ teip --version
teip 2.2.0
~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null 4.38s user 0.42s system 159% cpu 3.000 total
~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null 4.45s user 0.37s system 161% cpu 2.978 total
~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null 4.44s user 0.35s system 161% cpu 2.966 total
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time ./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > 5.23s user 0.31s system 117% cpu 4.732 total
~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time ./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > 5.62s user 0.24s system 118% cpu 4.952 total
~/repos/greymd/teip [git:feature/extended_list@teip]
$ sudo sh -c 'echo 3 > /proc/sys/vm/drop_caches'; time ./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > /dev/null
./target/release/teip -c 1-10,20- -f 2 -- tr '[:print:]' '@' < test.csv > 5.18s user 0.28s system 117% cpu 4.627 total
Update char_proc in proc.rs
/// Bypassing character range ( -c )
pub fn char_proc(
ch: &mut PipeIntercepter,
line: &Vec<u8>,
ranges: &Vec<list::ranges::Range>,
) -> Result<(), errors::ChunkSendError> {
let line = String::from_utf8_lossy(&line).to_string();
let cs = line.chars();
let mut ri = 0; // range index
// Merge consequent characters' range to execute commands as few times as possible.
for (i, c) in cs.enumerate() {
// If the current position is out of the range, go to the next range.
if ranges[ri].high < (i + 1) && (ri + 1) < ranges.len() {
ri += 1;
}
if ranges[ri].low <= (i + 1) && (i + 1) <= ranges[ri].high {
ch.buf_send_byps(c.to_string())?
} else {
ch.buf_send_keep(c.to_string())?
}
}
ch.flush_byps()?;
ch.flush_keep()?;
Ok(())
}
Add following statements to chunk.rs
#[derive(Debug)]
pub enum ChunkGroup {
Keep, // a string under masking tape. Printed as is.
Hole, // A hole on the masking tape. The string in the hole being processed other thread.
}
Add following statements to pipeintercepter.rs
pub struct ChunkBuf {
keep_buf: String,
byps_buf: String,
last_target: Option<ChunkGroup>,
}
impl ChunkBuf {
fn new() -> ChunkBuf {
ChunkBuf {
keep_buf: String::with_capacity(DEFAULT_CAP),
byps_buf: String::with_capacity(DEFAULT_CAP),
last_target: None,
}
}
}
/// struct for bypassing input and its interface
pub struct PipeIntercepter {
tx: Sender<Chunk>,
pipe_writer: BufWriter<Box<dyn Write + Send + 'static>>, // Not used when -s
handler: Option<JoinHandle<()>>, // "option dance"
line_end: u8,
solid: bool,
dryrun: bool,
chunk_buf: ChunkBuf,
}
Add following statements to pipeintercepter.rs under PipeIntercepter struct
/// Used as send_keep() but it prevents sending chunk immediately.
/// Keep the chunk in the buffer until the next Hole is found.
/// Actually, send_keep() is not called directly but called by buf_send_byps().
pub fn buf_send_keep(&mut self, msg: String) -> Result<(), errors::ChunkSendError> {
// append msg to keep_buf
self.chunk_buf.keep_buf.push_str(&msg);
match self.chunk_buf.last_target {
Some(ChunkGroup::Keep) => {
return Ok(());
}
Some(ChunkGroup::Hole) => {
match self.send_byps(self.chunk_buf.byps_buf.clone()) {
Ok(_) => {
// clear keep_buf
self.chunk_buf.byps_buf.clear();
self.chunk_buf.last_target = Some(ChunkGroup::Keep);
return Ok(());
}
Err(e) => return Err(e),
}
}
// Initialize
None => {
self.chunk_buf.last_target = Some(ChunkGroup::Keep);
return Ok(());
}
}
}
/// Used as send_byps() but it prevents sending chunk immediately.
/// Keep the chunk in the buffer until the next Keep is found.
/// Actually, send_byps() is not called directly but called by buf_send_keep().
pub fn buf_send_byps(&mut self, msg: String) -> Result<(), errors::ChunkSendError> {
// append msg to byps_buf
self.chunk_buf.byps_buf.push_str(&msg);
match self.chunk_buf.last_target {
Some(ChunkGroup::Keep) => {
match self.send_keep(self.chunk_buf.keep_buf.clone()) {
Ok(_) => {
// clear keep_buf
self.chunk_buf.keep_buf.clear();
self.chunk_buf.last_target = Some(ChunkGroup::Hole);
return Ok(());
}
Err(e) => return Err(e),
}
}
Some(ChunkGroup::Hole) => {
return Ok(());
}
// Initialize
None => {
self.chunk_buf.last_target = Some(ChunkGroup::Hole);
return Ok(());
}
}
}
// send remaining chunks to be bypassed in the buffer
pub fn flush_byps(&mut self) -> Result<(), errors::ChunkSendError> {
if !self.chunk_buf.byps_buf.is_empty() {
match self.send_byps(self.chunk_buf.byps_buf.clone()) {
Ok(_) => {
// clear keep_buf
self.chunk_buf.byps_buf.clear();
self.chunk_buf.last_target = None;
return Ok(());
}
Err(e) => return Err(e),
}
}
return Ok(());
}
pub fn flush_keep(&mut self) -> Result<(), errors::ChunkSendError> {
if !self.chunk_buf.keep_buf.is_empty() {
match self.send_keep(self.chunk_buf.keep_buf.clone()) {
Ok(_) => {
// clear keep_buf
self.chunk_buf.keep_buf.clear();
self.chunk_buf.last_target = None;
return Ok(());
}
Err(e) => return Err(e),
}
}
return Ok(());
}
teip does not sacrifice performance for better maintainability. Therefore, we have decided not to address this Issue.
Such the pattern is frequently observed on the code base.
https://github.com/greymd/teip/blob/master/src/procs.rs#L152-L165
This procedure can be unified to a single function.