From 7e79eeb9cb81b93dc7a098d294c2ecbee7f69bde Mon Sep 17 00:00:00 2001
From: Quentin Perret <quentin.perret@arm.com>
Date: Fri, 25 Jan 2019 13:25:05 +0000
Subject: [PATCH] target: Robustify read_tree_values()

target.read_tree_values() has several weaknesses. It doesn't support
files with ':' in their name, and it fails when reading binary files.
In essence, these limitations are caused by its fragile implementation
based on grep in shutils.

In order to robustify read_tree_values(), use tar and base64 to send the
content of a tree to the host, which can then process it from there. In
the process, read_tree_values() gains two new arguments:
 - decode_unicode: must be set to work with text/utf-8 content;
 - strip_null_chars: must be set to remove '\00' chars from text files.

Both are set to true by default to keep backward compatibility with the
existing code.

Suggested-by: Douglas Raillard <douglas.raillard@arm.com>
Signed-off-by: Quentin Perret <quentin.perret@arm.com>
---
 devlib/bin/scripts/shutils.in | 32 ++++++++++--------
 devlib/target.py              | 63 +++++++++++++++++++++++++++++------
 2 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/devlib/bin/scripts/shutils.in b/devlib/bin/scripts/shutils.in
index 35213ef..37991a7 100755
--- a/devlib/bin/scripts/shutils.in
+++ b/devlib/bin/scripts/shutils.in
@@ -255,26 +255,32 @@ sched_get_kernel_attributes() {
 # Misc
 ################################################################################
 
-read_tree_values() {
+read_tree_tgz_b64() {
     BASEPATH=$1
     MAXDEPTH=$2
+    TMPBASE=$3
 
     if [ ! -e $BASEPATH ]; then
         echo "ERROR: $BASEPATH does not exist"
         exit 1
     fi
 
-    PATHS=$($BUSYBOX find $BASEPATH -follow -maxdepth $MAXDEPTH)
-    i=0
-    for path in $PATHS; do
-        i=$(expr $i + 1)
-        if [ $i -gt 1 ]; then
-            break;
-        fi
+    cd $TMPBASE
+    TMP_FOLDER=$($BUSYBOX realpath $($BUSYBOX mktemp -d XXXXXX))
+
+    # 'tar' doesn't work as expected on debugfs, so copy the tree first to
+    # work around the issue
+    cd $BASEPATH
+    for CUR_FILE in $($BUSYBOX find . -follow -type f -maxdepth $MAXDEPTH); do
+        $BUSYBOX cp --parents $CUR_FILE $TMP_FOLDER/ 2> /dev/null
     done
-    if [ $i -gt 1 ]; then
-        $BUSYBOX grep -s '' $PATHS
-    fi
+
+    cd $TMP_FOLDER
+    $BUSYBOX tar cz * | $BUSYBOX base64
+
+    # Clean up the tmp folder since we won't need it any more
+    cd $TMPBASE
+    rm -rf $TMP_FOLDER
 }
 
 get_linux_system_id() {
@@ -347,8 +353,8 @@ ftrace_get_function_stats)
 hotplug_online_all)
 	hotplug_online_all
     ;;
-read_tree_values)
-	read_tree_values $*
+read_tree_tgz_b64)
+	read_tree_tgz_b64 $*
     ;;
 get_linux_system_id)
 	get_linux_system_id $*
diff --git a/devlib/target.py b/devlib/target.py
index 475c166..e210693 100644
--- a/devlib/target.py
+++ b/devlib/target.py
@@ -13,6 +13,9 @@
 # limitations under the License.
 #
 
+import io
+import base64
+import gzip
 import os
 import re
 import time
@@ -684,23 +687,61 @@ class Target(object):
         timeout = duration + 10
         self.execute('sleep {}'.format(duration), timeout=timeout)
 
-    def read_tree_values_flat(self, path, depth=1, check_exit_code=True):
-        command = 'read_tree_values {} {}'.format(quote(path), depth)
+    def read_tree_values_flat(self, path, depth=1, check_exit_code=True,
+                              decode_unicode=True, strip_null_chars=True):
+        command = 'read_tree_tgz_b64 {} {} {}'.format(quote(path), depth,
+                                                  quote(self.working_directory))
         output = self._execute_util(command, as_root=self.is_rooted,
                                     check_exit_code=check_exit_code)
 
-        accumulator = defaultdict(list)
-        for entry in output.strip().split('\n'):
-            if ':' not in entry:
-                continue
-            path, value = entry.strip().split(':', 1)
-            accumulator[path].append(value)
+        result = {}
+
+        # Unpack the archive in memory
+        tar_gz = base64.b64decode(output)
+        tar_gz_bytes = io.BytesIO(tar_gz)
+        tar_buf = gzip.GzipFile(fileobj=tar_gz_bytes).read()
+        tar_bytes = io.BytesIO(tar_buf)
+        with tarfile.open(fileobj=tar_bytes) as tar:
+            for member in tar.getmembers():
+                try:
+                    content_f = tar.extractfile(member)
+                # ignore exotic members like sockets
+                except Exception:
+                    continue
+                # if it is a file and not a folder
+                if content_f:
+                    content = content_f.read()
+                    if decode_unicode:
+                        try:
+                            content = content.decode('utf-8').strip()
+                            if strip_null_chars:
+                                content = content.replace('\x00', '').strip()
+                        except UnicodeDecodeError:
+                            content = ''
+
+                    name = self.path.join(path, member.name)
+                    result[name] = content
 
-        result = {k: '\n'.join(v).strip() for k, v in accumulator.items()}
         return result
 
-    def read_tree_values(self, path, depth=1, dictcls=dict, check_exit_code=True):
-        value_map = self.read_tree_values_flat(path, depth, check_exit_code)
+    def read_tree_values(self, path, depth=1, dictcls=dict,
+                         check_exit_code=True, decode_unicode=True,
+                         strip_null_chars=True):
+        """
+        Reads the content of all files under a given tree
+
+        :path: path to the tree
+        :depth: maximum tree depth to read
+        :dictcls: type of the dict used to store the results
+        :check_exit_code: raise an exception if the shutil command fails
+        :decode_unicode: decode the content of files as utf-8
+        :strip_null_chars: remove '\x00' chars from the content of utf-8
+                           decoded files
+
+        :returns: a tree-like dict with the content of files as leaves
+        """
+        value_map = self.read_tree_values_flat(path, depth, check_exit_code,
+                                               decode_unicode, strip_null_chars)
         return _build_path_tree(value_map, path, self.path.sep, dictcls)
 
     # internal methods