From 82ca8804a2e6b427728734d3f8eb9f3ea37c6857 Mon Sep 17 00:00:00 2001 From: sharkdp Date: Sat, 31 Aug 2019 19:30:24 +0200 Subject: [PATCH] Handle non-unicode characters in the preprocessor --- .../syntaxes/show-nonprintable.sublime-syntax | 4 + src/clap_app.rs | 3 +- src/preprocessor.rs | 128 ++++++++++++++---- src/printer.rs | 46 +++---- 4 files changed, 126 insertions(+), 55 deletions(-) diff --git a/assets/syntaxes/show-nonprintable.sublime-syntax b/assets/syntaxes/show-nonprintable.sublime-syntax index d9647155..179d88b6 100644 --- a/assets/syntaxes/show-nonprintable.sublime-syntax +++ b/assets/syntaxes/show-nonprintable.sublime-syntax @@ -25,3 +25,7 @@ contexts: scope: entity.other.attribute-name.show-nonprintable.escape - match: "␈" scope: entity.other.attribute-name.show-nonprintable.backspace + - match: "\\\\x[A-Z0-9][A-Z0-9]" + scope: comment.block.show-nonprintable.backspace + - match: "\\\\u\\{[a-z0-9]+\\}" + scope: comment.block.show-nonprintable.backspace diff --git a/src/clap_app.rs b/src/clap_app.rs index 08d6b632..f71bd2fa 100644 --- a/src/clap_app.rs +++ b/src/clap_app.rs @@ -172,7 +172,8 @@ pub fn build_app(interactive_output: bool) -> ClapApp<'static, 'static> { .help("Show non-printable characters (space, tab, newline, ..).") .long_help( "Show non-printable characters like space, tab or newline. \ - Use '--tabs' to control the width of the tab-placeholders.", + Use '--tabs' to control the width of the tab-placeholders. 
\ + This option can also be used to print binary files.", ), ) .arg( diff --git a/src/preprocessor.rs b/src/preprocessor.rs index f3e2f4d1..cfe0ffd6 100644 --- a/src/preprocessor.rs +++ b/src/preprocessor.rs @@ -33,41 +33,115 @@ pub fn expand_tabs(line: &str, width: usize, cursor: &mut usize) -> String { buffer } -pub fn replace_nonprintable(input: &str, tab_width: usize) -> String { +fn try_parse_utf8_char(input: &[u8]) -> Option<(char, usize)> { + let str_from_utf8 = |seq| std::str::from_utf8(seq).ok(); + + let decoded = None + .or(input.get(0..1).and_then(str_from_utf8).map(|c| (c, 1))) + .or(input.get(0..2).and_then(str_from_utf8).map(|c| (c, 2))) + .or(input.get(0..3).and_then(str_from_utf8).map(|c| (c, 3))) + .or(input.get(0..4).and_then(str_from_utf8).map(|c| (c, 4))); + + let decoded_char = decoded.map(|(seq, n)| (seq.chars().next().unwrap(), n)); + + decoded_char +} + +pub fn replace_nonprintable(input: &[u8], tab_width: usize) -> String { let mut output = String::new(); let tab_width = if tab_width == 0 { 4 } else { tab_width }; - for chr in input.chars() { - match chr { - // space - ' ' => output.push('•'), - // tab - '\t' => { - if tab_width == 1 { - output.push('↹'); - } else { - output.push('├'); - output.push_str(&"─".repeat(tab_width - 2)); - output.push('┤'); + let mut idx = 0; + let len = input.len(); + while idx < len { + if let Some((chr, skip_ahead)) = try_parse_utf8_char(&input[idx..]) { + idx += skip_ahead; + + match chr { + // space + ' ' => output.push('•'), + // tab + '\t' => { + if tab_width == 1 { + output.push('↹'); + } else { + output.push('├'); + output.push_str(&"─".repeat(tab_width - 2)); + output.push('┤'); + } } + // line feed + '\x0A' => output.push('␊'), + // carriage return + '\x0D' => output.push('␍'), + // null + '\x00' => output.push('␀'), + // bell + '\x07' => output.push('␇'), + // backspace + '\x08' => output.push('␈'), + // escape + '\x1B' => output.push('␛'), + // printable ASCII + c if c.is_ascii_alphanumeric() + || 
c.is_ascii_punctuation() + || c.is_ascii_graphic() => + { + output.push(c) + } + // everything else + c => output.push_str(&c.escape_unicode().collect::<String>()), } - // line feed - '\x0A' => output.push('␊'), - // carriage return - '\x0D' => output.push('␍'), - // null - '\x00' => output.push('␀'), - // bell - '\x07' => output.push('␇'), - // backspace - '\x08' => output.push('␈'), - // escape - '\x1B' => output.push('␛'), - // anything else - _ => output.push(chr), + } else { + output.push_str(&format!("\\x{:02X}", input[idx])); + idx += 1; } } output } + +#[test] +fn test_try_parse_utf8_char() { + assert_eq!(try_parse_utf8_char(&[0x20]), Some((' ', 1))); + assert_eq!(try_parse_utf8_char(&[0x20, 0x20]), Some((' ', 1))); + assert_eq!(try_parse_utf8_char(&[0x20, 0xef]), Some((' ', 1))); + + assert_eq!(try_parse_utf8_char(&[0x00]), Some(('\x00', 1))); + assert_eq!(try_parse_utf8_char(&[0x1b]), Some(('\x1b', 1))); + + assert_eq!(try_parse_utf8_char(&[0xc3, 0xa4]), Some(('ä', 2))); + assert_eq!(try_parse_utf8_char(&[0xc3, 0xa4, 0xef]), Some(('ä', 2))); + assert_eq!(try_parse_utf8_char(&[0xc3, 0xa4, 0x20]), Some(('ä', 2))); + + assert_eq!(try_parse_utf8_char(&[0xe2, 0x82, 0xac]), Some(('€', 3))); + assert_eq!( + try_parse_utf8_char(&[0xe2, 0x82, 0xac, 0xef]), + Some(('€', 3)) + ); + assert_eq!( + try_parse_utf8_char(&[0xe2, 0x82, 0xac, 0x20]), + Some(('€', 3)) + ); + + assert_eq!(try_parse_utf8_char(&[0xe2, 0x88, 0xb0]), Some(('∰', 3))); + + assert_eq!( + try_parse_utf8_char(&[0xf0, 0x9f, 0x8c, 0x82]), + Some(('🌂', 4)) + ); + assert_eq!( + try_parse_utf8_char(&[0xf0, 0x9f, 0x8c, 0x82, 0xef]), + Some(('🌂', 4)) + ); + assert_eq!( + try_parse_utf8_char(&[0xf0, 0x9f, 0x8c, 0x82, 0x20]), + Some(('🌂', 4)) + ); + + assert_eq!(try_parse_utf8_char(&[]), None); + assert_eq!(try_parse_utf8_char(&[0xef]), None); + assert_eq!(try_parse_utf8_char(&[0xef, 0x20]), None); + assert_eq!(try_parse_utf8_char(&[0xf0, 0xf0]), None); +} diff --git a/src/printer.rs b/src/printer.rs index 
7d8b4002..a53b89b7 100644 --- a/src/printer.rs +++ b/src/printer.rs @@ -1,4 +1,3 @@ -use std::ascii; use std::io::Write; use std::vec::Vec; @@ -232,7 +231,8 @@ impl<'a> Printer for InteractivePrinter<'a> { writeln!( handle, "{}: Binary content from {} will not be printed to the terminal \ - (but will be present if the output of 'bat' is piped).", + (but will be present if the output of 'bat' is piped). You can use 'bat -A' \ + to show the binary file contents.", Yellow.paint("[bat warning]"), input )?; @@ -281,7 +281,7 @@ impl<'a> Printer for InteractivePrinter<'a> { )?; if self.config.output_components.grid() { - if self.content_type.map_or(false, |c| c.is_text()) { + if self.content_type.map_or(false, |c| c.is_text()) || self.config.show_nonprintable { self.print_horizontal_line(handle, '┼')?; } else { self.print_horizontal_line(handle, '┴')?; @@ -292,7 +292,8 @@ impl<'a> Printer for InteractivePrinter<'a> { } fn print_footer(&mut self, handle: &mut dyn Write) -> Result<()> { - if self.config.output_components.grid() && self.content_type.map_or(false, |c| c.is_text()) + if self.config.output_components.grid() + && (self.content_type.map_or(false, |c| c.is_text()) || self.config.show_nonprintable) { self.print_horizontal_line(handle, '┴') } else { @@ -331,32 +332,23 @@ impl<'a> Printer for InteractivePrinter<'a> { line_number: usize, line_buffer: &[u8], ) -> Result<()> { - let mut line = match self.content_type { - None => { - return Ok(()); + let line = if self.config.show_nonprintable { + replace_nonprintable(&line_buffer, self.config.tab_width) + } else { + match self.content_type { + Some(ContentType::BINARY) | None => { + return Ok(()); + } + Some(ContentType::UTF_16LE) => UTF_16LE + .decode(&line_buffer, DecoderTrap::Replace) + .map_err(|_| "Invalid UTF-16LE")?, + Some(ContentType::UTF_16BE) => UTF_16BE + .decode(&line_buffer, DecoderTrap::Replace) + .map_err(|_| "Invalid UTF-16BE")?, + _ => String::from_utf8_lossy(&line_buffer).to_string(), } - 
Some(ContentType::BINARY) => String::from_utf8( - line_buffer - .as_ref() - .iter() - .map(|b| ascii::escape_default(*b)) - .flatten() - .collect(), - ) - .unwrap(), - Some(ContentType::UTF_16LE) => UTF_16LE - .decode(&line_buffer, DecoderTrap::Replace) - .map_err(|_| "Invalid UTF-16LE")?, - Some(ContentType::UTF_16BE) => UTF_16BE - .decode(&line_buffer, DecoderTrap::Replace) - .map_err(|_| "Invalid UTF-16BE")?, - _ => String::from_utf8_lossy(&line_buffer).to_string(), }; - if self.config.show_nonprintable { - line = replace_nonprintable(&line, self.config.tab_width); - } - let regions = { let highlighter = match self.highlighter { Some(ref mut highlighter) => highlighter,