support Python 3.x in importpkg
author Helmut Grohne <helmut@subdivi.de>
Thu, 28 Apr 2016 19:35:42 +0000 (21:35 +0200)
committer Helmut Grohne <helmut@subdivi.de>
Thu, 28 Apr 2016 19:35:42 +0000 (21:35 +0200)
In Python 2.x, TarInfo.name is a bytes object. In Python 3.x,
TarInfo.name is always a unicode object. To keep importpkg from crashing
with an exception, we direct the Python 3.x decoding to use the
surrogateescape error handler. Thus decoding the name boils down to
checking whether it contains surrogates.

importpkg.py

index dac4bb1..e8cc2fa 100755 (executable)
@@ -42,9 +42,32 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
-def decompress_tar(filelike, extension):
-    filelike = decompress(filelike, extension.decode("ascii"))
-    return tarfile.open(fileobj=filelike, mode="r|")
+if sys.version_info.major >= 3:
+    def decompress_tar(filelike, extension):
+        filelike = decompress(filelike, extension.decode("ascii"))
+        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
+                            errors="surrogateescape")
+
+    def decodetarname(name):
+        """Return name if it is valid UTF-8, i.e. has no escaped surrogates.
+        @raises UnicodeDecodeError: if name was built from non-UTF-8 bytes
+        """
+        try:
+            name.encode("utf8", "strict")
+        except UnicodeEncodeError as e:
+            if e.reason == "surrogates not allowed":
+                name.encode("utf8", "surrogateescape").decode("utf8", "strict")
+        return name
+else:
+    def decompress_tar(filelike, extension):
+        filelike = decompress(filelike, extension.decode("ascii"))
+        return tarfile.open(fileobj=filelike, mode="r|")
+
+    def decodetarname(name):
+        """Decode the bytes name of a tarinfo as UTF-8.
+        @raises UnicodeDecodeError: if name is not valid UTF-8
+        """
+        return name.decode("utf8")
 
 class ProcessingFinished(Exception):
     pass
@@ -77,7 +100,7 @@ class ImportpkgExtractor(DebExtractor):
             tf = decompress_tar(filelike, name[8:])
             for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
                 try:
-                    name = name.decode("utf8")
+                    name = decodetarname(name)
                 except UnicodeDecodeError:
                     print("warning: skipping filename with encoding error")
                     continue # skip files with non-utf8 encoding for now