From 82ca8804a2e6b427728734d3f8eb9f3ea37c6857 Mon Sep 17 00:00:00 2001
From: sharkdp <davidpeter@web.de>
Date: Sat, 31 Aug 2019 19:30:24 +0200
Subject: [PATCH] Handle non-unicode characters in the preprocessor

---
 .../syntaxes/show-nonprintable.sublime-syntax |   4 +
 src/clap_app.rs                               |   3 +-
 src/preprocessor.rs                           | 128 ++++++++++++++----
 src/printer.rs                                |  46 +++----
 4 files changed, 126 insertions(+), 55 deletions(-)

diff --git a/assets/syntaxes/show-nonprintable.sublime-syntax b/assets/syntaxes/show-nonprintable.sublime-syntax
index d9647155..179d88b6 100644
--- a/assets/syntaxes/show-nonprintable.sublime-syntax
+++ b/assets/syntaxes/show-nonprintable.sublime-syntax
@@ -25,3 +25,7 @@ contexts:
       scope: entity.other.attribute-name.show-nonprintable.escape
     - match: "␈"
       scope: entity.other.attribute-name.show-nonprintable.backspace
+    - match: "\\\\x[0-9A-F][0-9A-F]"
+      scope: comment.block.show-nonprintable.backspace
+    - match: "\\\\u\\{[0-9a-f]+\\}"
+      scope: comment.block.show-nonprintable.backspace
diff --git a/src/clap_app.rs b/src/clap_app.rs
index 08d6b632..f71bd2fa 100644
--- a/src/clap_app.rs
+++ b/src/clap_app.rs
@@ -172,7 +172,8 @@ pub fn build_app(interactive_output: bool) -> ClapApp<'static, 'static> {
                 .help("Show non-printable characters (space, tab, newline, ..).")
                 .long_help(
                     "Show non-printable characters like space, tab or newline. \
-                     Use '--tabs' to control the width of the tab-placeholders.",
+                     Use '--tabs' to control the width of the tab-placeholders. \
+                     This option can also be used to print binary files.",
                 ),
         )
         .arg(
diff --git a/src/preprocessor.rs b/src/preprocessor.rs
index f3e2f4d1..cfe0ffd6 100644
--- a/src/preprocessor.rs
+++ b/src/preprocessor.rs
@@ -33,41 +33,115 @@ pub fn expand_tabs(line: &str, width: usize, cursor: &mut usize) -> String {
     buffer
 }
 
-pub fn replace_nonprintable(input: &str, tab_width: usize) -> String {
+/// Try to decode one UTF-8 encoded character from the start of `input`.
+///
+/// Returns the character together with the number of bytes it occupies, or
+/// `None` if none of the first one to four bytes form valid UTF-8 (which
+/// includes empty input).
+fn try_parse_utf8_char(input: &[u8]) -> Option<(char, usize)> {
+    // Probe prefixes from shortest to longest and stop at the first hit.
+    (1..=4).find_map(|len| {
+        let prefix = input.get(..len)?;
+        let chr = std::str::from_utf8(prefix).ok()?.chars().next()?;
+        Some((chr, len))
+    })
+}
+
+/// Replaces whitespace and control characters in `input` with visible symbols,
+/// escapes all other non-graphic or non-ASCII characters as `\u{..}` and bytes
+/// that are not valid UTF-8 as `\xNN`. A `tab_width` of 0 is treated as 4.
+pub fn replace_nonprintable(input: &[u8], tab_width: usize) -> String {
     let mut output = String::new();
 
     let tab_width = if tab_width == 0 { 4 } else { tab_width };
 
-    for chr in input.chars() {
-        match chr {
-            // space
-            ' ' => output.push('•'),
-            // tab
-            '\t' => {
-                if tab_width == 1 {
-                    output.push('↹');
-                } else {
-                    output.push('├');
-                    output.push_str(&"─".repeat(tab_width - 2));
-                    output.push('┤');
+    let mut idx = 0;
+    let len = input.len();
+    while idx < len {
+        if let Some((chr, skip_ahead)) = try_parse_utf8_char(&input[idx..]) {
+            idx += skip_ahead;
+
+            match chr {
+                // space
+                ' ' => output.push('•'),
+                // tab
+                '\t' => {
+                    if tab_width == 1 {
+                        output.push('↹');
+                    } else {
+                        output.push('├');
+                        output.push_str(&"─".repeat(tab_width - 2));
+                        output.push('┤');
+                    }
                 }
+                // line feed
+                '\x0A' => output.push('␊'),
+                // carriage return
+                '\x0D' => output.push('␍'),
+                // null
+                '\x00' => output.push('␀'),
+                // bell
+                '\x07' => output.push('␇'),
+                // backspace
+                '\x08' => output.push('␈'),
+                // escape
+                '\x1B' => output.push('␛'),
+                // printable ASCII: `is_ascii_graphic` alone covers every
+                // alphanumeric and punctuation character
+                c if c.is_ascii_graphic() => output.push(c),
+                // everything else
+                c => output.extend(c.escape_unicode()),
             }
-            // line feed
-            '\x0A' => output.push('␊'),
-            // carriage return
-            '\x0D' => output.push('␍'),
-            // null
-            '\x00' => output.push('␀'),
-            // bell
-            '\x07' => output.push('␇'),
-            // backspace
-            '\x08' => output.push('␈'),
-            // escape
-            '\x1B' => output.push('␛'),
-            // anything else
-            _ => output.push(chr),
+        } else {
+            // not valid UTF-8 at this position: emit the byte in hex, move on
+            output.push_str(&format!("\\x{:02X}", input[idx]));
+            idx += 1;
         }
     }
 
     output
 }
+
+/// Exercises `try_parse_utf8_char` on well-formed and malformed byte input.
+///
+/// A successful parse must yield the decoded character together with the
+/// number of bytes it occupies, no matter which bytes follow it.
+#[test]
+fn test_try_parse_utf8_char() {
+    // Each entry pairs the input bytes with the expected parse result:
+    // the decoded character and the number of bytes consumed, or `None`.
+    let cases: &[(&[u8], Option<(char, usize)>)] = &[
+        // one-byte (ASCII) characters
+        (&[0x20], Some((' ', 1))),
+        (&[0x20, 0x20], Some((' ', 1))),
+        (&[0x20, 0xef], Some((' ', 1))),
+        // ASCII control characters
+        (&[0x00], Some(('\x00', 1))),
+        (&[0x1b], Some(('\x1b', 1))),
+        // two-byte sequences
+        (&[0xc3, 0xa4], Some(('ä', 2))),
+        (&[0xc3, 0xa4, 0xef], Some(('ä', 2))),
+        (&[0xc3, 0xa4, 0x20], Some(('ä', 2))),
+        // three-byte sequences
+        (&[0xe2, 0x82, 0xac], Some(('€', 3))),
+        (&[0xe2, 0x82, 0xac, 0xef], Some(('€', 3))),
+        (&[0xe2, 0x82, 0xac, 0x20], Some(('€', 3))),
+        (&[0xe2, 0x88, 0xb0], Some(('∰', 3))),
+        // four-byte sequences
+        (&[0xf0, 0x9f, 0x8c, 0x82], Some(('🌂', 4))),
+        (&[0xf0, 0x9f, 0x8c, 0x82, 0xef], Some(('🌂', 4))),
+        (&[0xf0, 0x9f, 0x8c, 0x82, 0x20], Some(('🌂', 4))),
+        // empty input and byte sequences that do not start with a
+        // complete, valid UTF-8 character
+        (&[], None),
+        (&[0xef], None),
+        (&[0xef, 0x20], None),
+        (&[0xf0, 0xf0], None),
+    ];
+
+    // Run the cases in order; the first mismatch fails the test.
+    for (bytes, expected) in cases {
+        let actual = try_parse_utf8_char(bytes);
+        assert_eq!(actual, *expected);
+    }
+}
diff --git a/src/printer.rs b/src/printer.rs
index 7d8b4002..a53b89b7 100644
--- a/src/printer.rs
+++ b/src/printer.rs
@@ -1,4 +1,3 @@
-use std::ascii;
 use std::io::Write;
 use std::vec::Vec;
 
@@ -232,7 +231,8 @@ impl<'a> Printer for InteractivePrinter<'a> {
                 writeln!(
                     handle,
                     "{}: Binary content from {} will not be printed to the terminal \
-                     (but will be present if the output of 'bat' is piped).",
+                     (but will be present if the output of 'bat' is piped). You can use 'bat -A' \
+                     to show the binary file contents.",
                     Yellow.paint("[bat warning]"),
                     input
                 )?;
@@ -281,7 +281,7 @@ impl<'a> Printer for InteractivePrinter<'a> {
         )?;
 
         if self.config.output_components.grid() {
-            if self.content_type.map_or(false, |c| c.is_text()) {
+            if self.content_type.map_or(false, |c| c.is_text()) || self.config.show_nonprintable {
                 self.print_horizontal_line(handle, '┼')?;
             } else {
                 self.print_horizontal_line(handle, '┴')?;
@@ -292,7 +292,8 @@ impl<'a> Printer for InteractivePrinter<'a> {
     }
 
     fn print_footer(&mut self, handle: &mut dyn Write) -> Result<()> {
-        if self.config.output_components.grid() && self.content_type.map_or(false, |c| c.is_text())
+        if self.config.output_components.grid()
+            && (self.content_type.map_or(false, |c| c.is_text()) || self.config.show_nonprintable)
         {
             self.print_horizontal_line(handle, '┴')
         } else {
@@ -331,32 +332,23 @@ impl<'a> Printer for InteractivePrinter<'a> {
         line_number: usize,
         line_buffer: &[u8],
     ) -> Result<()> {
-        let mut line = match self.content_type {
-            None => {
-                return Ok(());
+        let line = if self.config.show_nonprintable {
+            replace_nonprintable(&line_buffer, self.config.tab_width)
+        } else {
+            match self.content_type {
+                Some(ContentType::BINARY) | None => {
+                    return Ok(());
+                }
+                Some(ContentType::UTF_16LE) => UTF_16LE
+                    .decode(&line_buffer, DecoderTrap::Replace)
+                    .map_err(|_| "Invalid UTF-16LE")?,
+                Some(ContentType::UTF_16BE) => UTF_16BE
+                    .decode(&line_buffer, DecoderTrap::Replace)
+                    .map_err(|_| "Invalid UTF-16BE")?,
+                _ => String::from_utf8_lossy(&line_buffer).to_string(),
             }
-            Some(ContentType::BINARY) => String::from_utf8(
-                line_buffer
-                    .as_ref()
-                    .iter()
-                    .map(|b| ascii::escape_default(*b))
-                    .flatten()
-                    .collect(),
-            )
-            .unwrap(),
-            Some(ContentType::UTF_16LE) => UTF_16LE
-                .decode(&line_buffer, DecoderTrap::Replace)
-                .map_err(|_| "Invalid UTF-16LE")?,
-            Some(ContentType::UTF_16BE) => UTF_16BE
-                .decode(&line_buffer, DecoderTrap::Replace)
-                .map_err(|_| "Invalid UTF-16BE")?,
-            _ => String::from_utf8_lossy(&line_buffer).to_string(),
         };
 
-        if self.config.show_nonprintable {
-            line = replace_nonprintable(&line, self.config.tab_width);
-        }
-
         let regions = {
             let highlighter = match self.highlighter {
                 Some(ref mut highlighter) => highlighter,