getsentry / utf16string

UTF-16 string types
Apache License 2.0
10 stars 6 forks source link

Add .lines() to WStr<E> #6

Open mr-adult opened 2 weeks ago

mr-adult commented 2 weeks ago

I would like to have a `.lines()` method available on `&WStr`. I have already implemented this in my own codebase, so feel free to take the code below and pull it into yours. I can also sign a licensing agreement if you need me to.

/// Extension methods for UTF-16 strings, parameterized by the byte order `Endianness`.
pub(crate) trait Utf16StringExtensions<Endianness: ByteOrder + 'static> {
    /// Returns an iterator over the lines of the string, each as a borrowed slice.
    ///
    /// A line ends at a newline (`\n`) or at a carriage return immediately followed by a
    /// line feed (`\r\n`). The terminator itself is never part of the yielded line.
    ///
    /// A carriage return (`\r`) that is not immediately followed by a line feed (`\n`) does
    /// not end a line; it is kept as part of the line it appears in.
    ///
    /// The terminator on the last line is optional: a string with a trailing line ending
    /// produces the same lines as the same string without it.
    fn lines(&self) -> impl Iterator<Item = &WStr<Endianness>>;
}

impl<Endianness: ByteOrder + 'static> Utf16StringExtensions<Endianness> for WString<Endianness> {
    /// Returns an iterator over the lines of this string, each as a borrowed slice.
    ///
    /// A line ends at a newline (`\n`) or at a carriage return immediately followed by a
    /// line feed (`\r\n`). The terminator itself is never part of the yielded line.
    ///
    /// A carriage return (`\r`) that is not immediately followed by a line feed (`\n`) does
    /// not end a line; it is kept as part of the line it appears in.
    ///
    /// The terminator on the last line is optional: a string with a trailing line ending
    /// produces the same lines as the same string without it.
    fn lines(&self) -> impl Iterator<Item = &WStr<Endianness>> {
        // Borrow the owned string as a slice; the iterator only ever needs `&WStr`.
        let source: &WStr<Endianness> = self;
        Lines::new(source)
    }
}

/// Iterator state for walking a UTF-16 string slice one line at a time.
///
/// Created by [`Lines::new`]; yields sub-slices of `source`.
struct Lines<'source, Endianness>
where
    Endianness: ByteOrder + 'static,
{
    /// The string being split; every yielded line is a sub-slice of it.
    source: &'source WStr<Endianness>,
    /// Character iterator over `source`. It is advanced by `next`, and the indices it
    /// yields are used as the offsets for slicing `source`.
    char_indices: WStrCharIndices<'source, Endianness>,
}

impl<'source, Endianness: ByteOrder + 'static> Lines<'source, Endianness> {
    /// Builds a line iterator positioned at the beginning of `source`.
    fn new(source: &'source WStr<Endianness>) -> Self {
        let char_indices = source.char_indices();
        Self {
            source,
            char_indices,
        }
    }
}

impl<'source, Endianness> Iterator for Lines<'source, Endianness>
where
    Endianness: ByteOrder + 'static,
{
    type Item = &'source WStr<Endianness>;

    /// Returns the next line of the source string, without its line terminator.
    ///
    /// A line ends at `\n` or `\r\n`. A `\r` that is not immediately followed by `\n` stays
    /// part of the line. Empty lines are yielded as empty slices, and a final line without
    /// a terminator is yielded as-is, even when it is a single character long. Returns
    /// `None` once every character has been consumed.
    fn next(&mut self) -> Option<Self::Item> {
        let source = self.source;

        // Offset of the first character of the line being scanned. It stays `None` until a
        // character is seen, which lets exhausted input be told apart from an empty line.
        let mut line_start = None;
        // Offset of a `\r` seen as the immediately preceding character, if any.
        let mut carriage_return_at = None;

        for (ch_index, ch) in self.char_indices.by_ref() {
            // Every character, including the very first one, goes through the same
            // terminator check, so a line that starts with `\n` or `\r\n` is handled.
            let start = *line_start.get_or_insert(ch_index);

            if ch == '\n' {
                // For `\r\n` the line ends where the `\r` begins; otherwise at the `\n`.
                let end = carriage_return_at.unwrap_or(ch_index);
                return Some(&source[start..end]);
            }

            carriage_return_at = if ch == '\r' { Some(ch_index) } else { None };
        }

        // Input ran out without a terminator: whatever was scanned is the final line.
        let start = line_start?;
        Some(&source[start..source.len()])
    }
}

#[cfg(test)]
mod lines_tests {
    use byteorder::{ByteOrder, BE, LE};
    use utf16string::{WStr, WString};

    use super::Utf16StringExtensions;

    #[test]
    fn can_handle_unix_newlines() {
        let input = "This is a string with a \n newline";
        let expected = ["This is a string with a ", " newline"];

        assert_lines::<BE>(input, &expected);
        assert_lines::<LE>(input, &expected);
    }

    #[test]
    fn doesnt_include_ending_unix_newline() {
        let input = "This is a string \n that terminates in a \\n character. The ending should not be included. \n";
        let expected = [
            "This is a string ",
            " that terminates in a \\n character. The ending should not be included. ",
        ];

        assert_lines::<BE>(input, &expected);
        assert_lines::<LE>(input, &expected);
    }

    #[test]
    fn can_handle_windows_newlines() {
        let input = "This is a string with a \r\n newline";
        let expected = ["This is a string with a ", " newline"];

        assert_lines::<BE>(input, &expected);
        assert_lines::<LE>(input, &expected);
    }

    #[test]
    fn doesnt_include_ending_windows_newline() {
        let input = "This is a string \r\n that terminates in a \\r\\n character. The ending should not be included. \r\n";
        let expected = [
            "This is a string ",
            " that terminates in a \\r\\n character. The ending should not be included. ",
        ];

        assert_lines::<BE>(input, &expected);
        assert_lines::<LE>(input, &expected);
    }

    /// Encodes `input` with byte order `E`, splits it into lines, and checks that the lines
    /// match `expected` in both count and content.
    fn assert_lines<E: ByteOrder + 'static>(input: &str, expected: &[&str]) {
        let wstring = WString::<E>::from(input);
        let lines: Vec<&WStr<E>> = wstring.lines().collect();

        assert_eq!(lines.len(), expected.len());
        for (want, got) in expected.iter().copied().zip(lines.iter().copied()) {
            assert!(are_equal(want, got));
        }
    }

    /// Reports whether `expected` and `actual` consist of the same sequence of characters.
    fn are_equal<Endianness: ByteOrder + 'static>(
        expected: &str,
        actual: &WStr<Endianness>,
    ) -> bool {
        expected.chars().eq(actual.chars())
    }
}
mr-adult commented 2 weeks ago

I would also be happy to submit a pull request with the changes.