diff --git a/CHANGELOG.md b/CHANGELOG.md index a1c2b775..69338d36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ - Add missing mappings for various bash/zsh files, see PR #3262 (@AdamGaskins) - Send all bat errors to stderr by default, see #3336 (@JerryImMouse) - Make --map-syntax target case insensitive to match --language, see #3206 (@keith-hall) -- Correctly determine the end of the line in UTF16LE input #3369 (@keith-hall) +- Correctly determine the end of the line in UTF16LE/BE input #3369 (@keith-hall) ## Other diff --git a/src/input.rs b/src/input.rs index 69b10906..3abfdd82 100644 --- a/src/input.rs +++ b/src/input.rs @@ -267,7 +267,9 @@ impl<'a> InputReader<'a> { }; if content_type == Some(ContentType::UTF_16LE) { - read_utf16le_line(&mut reader, &mut first_line).ok(); + read_utf16_line(&mut reader, &mut first_line, 0x00, 0x0A).ok(); + } else if content_type == Some(ContentType::UTF_16BE) { + read_utf16_line(&mut reader, &mut first_line, 0x0A, 0x00).ok(); } InputReader { @@ -283,26 +285,28 @@ impl<'a> InputReader<'a> { return Ok(true); } - let res = self.inner.read_until(b'\n', buf).map(|size| size > 0)?; - if self.content_type == Some(ContentType::UTF_16LE) { - return read_utf16le_line(&mut self.inner, buf); + return read_utf16_line(&mut self.inner, buf, 0x00, 0x0A); + } + if self.content_type == Some(ContentType::UTF_16BE) { + return read_utf16_line(&mut self.inner, buf, 0x0A, 0x00); } + let res = self.inner.read_until(b'\n', buf).map(|size| size > 0)?; Ok(res) } } -fn read_utf16le_line(reader: &mut R, buf: &mut Vec) -> io::Result { +fn read_utf16_line(reader: &mut R, buf: &mut Vec, read_until_char: u8, preceded_by_char: u8) -> io::Result { loop { let mut temp = Vec::new(); - let n = reader.read_until(0x00, &mut temp)?; + let n = reader.read_until(read_until_char, &mut temp)?; if n == 0 { // EOF reached break; } buf.extend_from_slice(&temp); - if buf.len() >= 2 && buf[buf.len() - 2] == 0x0A && buf[buf.len() - 1] == 0x00 { + if buf.len() >= 2 && buf[buf.len() - 2] == preceded_by_char && buf[buf.len() - 1] == read_until_char { // end of line found break; } @@ -403,7 +407,7 @@ fn utf16le_issue3367() { assert_eq!(b"\x68\x00\x65\x00\x6C\x00\x6C\x00\x6F\x00\x20\x00\x77\x00\x6F\x00\x72\x00\x6C\x00\x64\x00", &buffer[..]); buffer.clear(); - + let res = reader.read_line(&mut buffer); assert!(res.is_ok()); assert!(!res.unwrap()); diff --git a/tests/examples/test_UTF-16BE-complicated.txt b/tests/examples/test_UTF-16BE-complicated.txt new file mode 100644 index 00000000..f57ddeaa Binary files /dev/null and b/tests/examples/test_UTF-16BE-complicated.txt differ diff --git a/tests/integration_tests.rs b/tests/integration_tests.rs index 35a85623..7579794d 100644 --- a/tests/integration_tests.rs +++ b/tests/integration_tests.rs @@ -1343,6 +1343,18 @@ fn utf16le() { .stdout(" 1 上一伊刀\n 2 foo bar\n 3 hello world\n"); } +#[test] +fn utf16be() { + bat() + .arg("--decorations=always") + .arg("--style=numbers") + .arg("--color=never") + .arg("test_UTF-16BE-complicated.txt") + .assert() + .success() + .stdout(" 1 上一伊刀\n 2 foo bar\n 3 hello world\n"); +} + // Regression test for https://github.com/sharkdp/bat/issues/1922 #[test] fn bom_not_stripped_in_loop_through_mode() {