1 /* 2 * Copyright 2017 Google Inc. All Rights Reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.turbine.zip; 18 19 import static java.nio.charset.StandardCharsets.UTF_8; 20 21 import com.google.common.io.ByteStreams; 22 import com.google.common.primitives.UnsignedInts; 23 import java.io.ByteArrayInputStream; 24 import java.io.Closeable; 25 import java.io.IOError; 26 import java.io.IOException; 27 import java.nio.ByteBuffer; 28 import java.nio.ByteOrder; 29 import java.nio.MappedByteBuffer; 30 import java.nio.channels.FileChannel; 31 import java.nio.channels.FileChannel.MapMode; 32 import java.nio.charset.CharacterCodingException; 33 import java.nio.charset.CharsetDecoder; 34 import java.nio.file.Path; 35 import java.nio.file.StandardOpenOption; 36 import java.util.Iterator; 37 import java.util.zip.Inflater; 38 import java.util.zip.InflaterInputStream; 39 import java.util.zip.ZipException; 40 41 /** 42 * A fast, minimal, and somewhat garbage zip implementation. This exists because graal <a 43 * href="http://mail.openjdk.java.net/pipermail/graal-dev/2017-August/005039.html">doesn't yet 44 * support</a> {@link java.util.zip.ZipFile}, and {@link java.util.zip.ZipInputStream} doesn't have 45 * the performance we'd like (*). If you're reading this, you almost certainly want {@code ZipFile} 46 * instead. 47 * 48 * <p>If you're reading this because you're fixing a bug, sorry. 49 * 50 * <p>(*) A benchmark that iterates over all of the entries in rt.jar takes 6.97ms to run with this 51 * implementation and 202.99ms with ZipInputStream. (Those are averages across 100 reps, and I 52 * verified they're doing the same work.) This is likely largely due to ZipInputStream reading the 53 * entire file from the beginning to scan the local headers, whereas this implementation (and 54 * ZipFile) only read the central directory. Iterating over the entries (but not reading the data) 55 * is an interesting benchmark because we typically only read ~10% of the compile-time classpath, so 56 * most time is spent just scanning entry names. And rt.jar is an interesting test case because 57 * every compilation has to read it, and it dominates the size of the classpath for small 58 * compilations. 59 * 60 * <p>Implementation notes: 61 * 62 * <ul> 63 * <li>Leading garbage may be supported, since the archive is read backwards using the central 64 * directory. Archives modified with zip -A may not be supported. Trailing garbage is not 65 * supported. 66 * <li>UTF-8 is the only supported encoding. 67 * <li>STORED and DEFLATE are the only supported compression methods. 68 * <li>zip64 extensible data sectors are not supported. 69 * <li>Zip files larger than Integer.MAX_VALUE bytes are not supported. 70 * <li>The only supported ZIP64 field is ENDTOT. This implementation assumes that the ZIP64 end 71 * header is present only if ENDTOT in EOCD header is 0xFFFF. 72 * </ul> 73 */ 74 public class Zip { 75 76 static final int ZIP64_ENDSIG = 0x06064b50; 77 78 static final int LOCHDR = 30; // LOC header size 79 static final int CENHDR = 46; // CEN header size 80 static final int ENDHDR = 22; // END header size 81 static final int ZIP64_LOCHDR = 20; // ZIP64 end locator header size 82 static final int ZIP64_ENDHDR = 56; // ZIP64 end header size 83 84 static final int ENDTOT = 10; // total number of entries 85 static final int ENDSIZ = 12; // central directory size in bytes 86 static final int ENDCOM = 20; // zip file comment length 87 88 static final int CENHOW = 10; // compression method 89 static final int CENLEN = 24; // uncompressed size 90 static final int CENSIZ = 20; // compressed size 91 static final int CENNAM = 28; // filename length 92 static final int CENEXT = 30; // extra field length 93 static final int CENCOM = 32; // comment length 94 static final int CENOFF = 42; // LOC header offset 95 96 static final int LOCEXT = 28; // extra field length 97 98 static final int ZIP64_ENDSIZ = 40; // central directory size in bytes 99 100 static final int ZIP64_MAGICCOUNT = 0xFFFF; 101 102 /** Iterates over a zip archive. */ 103 static class ZipIterator implements Iterator<Entry> { 104 105 /** A reader for the backing storage. */ 106 private final FileChannel chan; 107 108 private final Path path; 109 private int cdindex = 0; 110 private final MappedByteBuffer cd; 111 private final CharsetDecoder decoder = UTF_8.newDecoder(); 112 ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd)113 ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd) { 114 this.path = path; 115 this.chan = chan; 116 this.cd = cd; 117 } 118 119 @Override hasNext()120 public boolean hasNext() { 121 return cdindex < cd.limit(); 122 } 123 124 /* Returns a {@link Entry} for the current CEN entry. */ 125 @Override next()126 public Entry next() { 127 // TODO(cushon): technically we're supposed to throw NSEE 128 checkSignature(path, cd, cdindex, 1, 2, "CENSIG"); 129 int nameLength = cd.getChar(cdindex + CENNAM); 130 int extLength = cd.getChar(cdindex + CENEXT); 131 int commentLength = cd.getChar(cdindex + CENCOM); 132 Entry entry = new Entry(path, chan, string(cd, cdindex + CENHDR, nameLength), cd, cdindex); 133 cdindex += CENHDR + nameLength + extLength + commentLength; 134 return entry; 135 } 136 string(ByteBuffer buf, int offset, int length)137 public String string(ByteBuffer buf, int offset, int length) { 138 buf = buf.duplicate(); 139 buf.position(offset); 140 buf.limit(offset + length); 141 decoder.reset(); 142 try { 143 return decoder.decode(buf).toString(); 144 } catch (CharacterCodingException e) { 145 throw new IOError(e); 146 } 147 } 148 } 149 150 /** Provides an {@link Iterable} of {@link Entry} over a zip archive. */ 151 public static class ZipIterable implements Iterable<Entry>, Closeable { 152 153 private final Path path; 154 private final FileChannel chan; 155 private final MappedByteBuffer cd; 156 ZipIterable(Path path)157 public ZipIterable(Path path) throws IOException { 158 this.path = path; 159 this.chan = FileChannel.open(path, StandardOpenOption.READ); 160 // Locate the EOCD 161 long size = chan.size(); 162 if (size < ENDHDR) { 163 throw new ZipException("invalid zip archive"); 164 } 165 long eocdOffset = size - ENDHDR; 166 MappedByteBuffer eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ENDHDR); 167 eocd.order(ByteOrder.LITTLE_ENDIAN); 168 int index = 0; 169 int commentSize = 0; 170 if (!isSignature(eocd, 0, 5, 6)) { 171 // The archive may contain a zip file comment; keep looking for the EOCD. 172 long start = Math.max(0, size - ENDHDR - 0xFFFF); 173 eocd = chan.map(MapMode.READ_ONLY, start, (size - start)); 174 eocd.order(ByteOrder.LITTLE_ENDIAN); 175 index = (int) ((size - start) - ENDHDR); 176 while (index > 0) { 177 index--; 178 eocd.position(index); 179 if (isSignature(eocd, index, 5, 6)) { 180 commentSize = (int) ((size - start) - ENDHDR) - index; 181 eocdOffset = start + index; 182 break; 183 } 184 } 185 } 186 checkSignature(path, eocd, index, 5, 6, "ENDSIG"); 187 int totalEntries = eocd.getChar(index + ENDTOT); 188 long cdsize = UnsignedInts.toLong(eocd.getInt(index + ENDSIZ)); 189 int actualCommentSize = eocd.getChar(index + ENDCOM); 190 if (commentSize != actualCommentSize) { 191 throw new ZipException( 192 String.format( 193 "zip file comment length was %d, expected %d", commentSize, actualCommentSize)); 194 } 195 // If the number of entries is 0xffff, check if the archive has a zip64 EOCD locator. 196 if (totalEntries == ZIP64_MAGICCOUNT) { 197 // Assume the zip64 EOCD has the usual size; we don't support zip64 extensible data sectors. 198 long zip64eocdOffset = size - ENDHDR - ZIP64_LOCHDR - ZIP64_ENDHDR; 199 MappedByteBuffer zip64eocd = chan.map(MapMode.READ_ONLY, zip64eocdOffset, ZIP64_ENDHDR); 200 zip64eocd.order(ByteOrder.LITTLE_ENDIAN); 201 // Note that zip reading is necessarily best-effort, since an archive could contain 0xFFFF 202 // entries and the last entry's data could contain a ZIP64_ENDSIG. Some implementations 203 // read the full EOCD records and compare them. 204 if (zip64eocd.getInt(0) == ZIP64_ENDSIG) { 205 cdsize = zip64eocd.getLong(ZIP64_ENDSIZ); 206 eocdOffset = zip64eocdOffset; 207 } 208 } 209 this.cd = chan.map(MapMode.READ_ONLY, eocdOffset - cdsize, cdsize); 210 cd.order(ByteOrder.LITTLE_ENDIAN); 211 } 212 213 @Override iterator()214 public Iterator<Entry> iterator() { 215 return new ZipIterator(path, chan, cd); 216 } 217 218 @Override close()219 public void close() throws IOException { 220 chan.close(); 221 } 222 } 223 224 /** An entry in a zip archive. */ 225 public static class Entry { 226 227 private final Path path; 228 private final FileChannel chan; 229 private final String name; 230 private final ByteBuffer cd; 231 private final int cdindex; 232 Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex)233 public Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex) { 234 this.path = path; 235 this.chan = chan; 236 this.name = name; 237 this.cd = cd; 238 this.cdindex = cdindex; 239 } 240 241 /** The entry name. */ name()242 public String name() { 243 return name; 244 } 245 246 /** The entry data. */ data()247 public byte[] data() { 248 // Read the offset and variable lengths from the central directory and then try to map in the 249 // data section in one shot. 250 long offset = UnsignedInts.toLong(cd.getInt(cdindex + CENOFF)); 251 int nameLength = cd.getChar(cdindex + CENNAM); 252 int extLength = cd.getChar(cdindex + CENEXT); 253 int compression = cd.getChar(cdindex + CENHOW); 254 switch (compression) { 255 case 0x8: 256 return getBytes( 257 offset, 258 nameLength, 259 extLength, 260 UnsignedInts.toLong(cd.getInt(cdindex + CENSIZ)), 261 /*deflate=*/ true); 262 case 0x0: 263 return getBytes( 264 offset, 265 nameLength, 266 extLength, 267 UnsignedInts.toLong(cd.getInt(cdindex + CENLEN)), 268 /*deflate=*/ false); 269 default: 270 throw new AssertionError( 271 String.format("unsupported compression mode: 0x%x", compression)); 272 } 273 } 274 275 /** 276 * Number of extra bytes to read for each file, to avoid re-mapping the data if the local header 277 * reports more extra field data than the central directory. 278 */ 279 static final int EXTRA_FIELD_SLACK = 128; 280 getBytes( long offset, int nameLength, int cenExtLength, long size, boolean deflate)281 private byte[] getBytes( 282 long offset, int nameLength, int cenExtLength, long size, boolean deflate) { 283 if (size > Integer.MAX_VALUE) { 284 throw new IllegalArgumentException("unsupported zip entry size: " + size); 285 } 286 try { 287 MappedByteBuffer fc = 288 chan.map( 289 MapMode.READ_ONLY, 290 offset, 291 Math.min( 292 LOCHDR + nameLength + cenExtLength + size + EXTRA_FIELD_SLACK, 293 chan.size() - offset)); 294 fc.order(ByteOrder.LITTLE_ENDIAN); 295 checkSignature(path, fc, /* index= */ 0, 3, 4, "LOCSIG"); 296 int locExtLength = fc.getChar(LOCEXT); 297 if (locExtLength > cenExtLength + EXTRA_FIELD_SLACK) { 298 // If the local header's extra fields don't match the central directory and we didn't 299 // leave enough slac, re-map the data section with the correct extra field length. 300 fc = chan.map(MapMode.READ_ONLY, offset + LOCHDR + nameLength + locExtLength, size); 301 fc.order(ByteOrder.LITTLE_ENDIAN); 302 } else { 303 // Otherwise seek past the local header, name, and extra fields to the data. 304 fc.position(LOCHDR + nameLength + locExtLength); 305 fc.limit((int) (LOCHDR + nameLength + locExtLength + size)); 306 } 307 byte[] bytes = new byte[(int) size]; 308 fc.get(bytes); 309 if (deflate) { 310 bytes = 311 ByteStreams.toByteArray( 312 new InflaterInputStream( 313 new ByteArrayInputStream(bytes), new Inflater(/*nowrap=*/ true))); 314 } 315 return bytes; 316 } catch (IOException e) { 317 throw new IOError(e); 318 } 319 } 320 } 321 checkSignature( Path path, MappedByteBuffer buf, int index, int i, int j, String name)322 static void checkSignature( 323 Path path, MappedByteBuffer buf, int index, int i, int j, String name) { 324 if (!isSignature(buf, index, i, j)) { 325 throw new AssertionError( 326 String.format( 327 "%s: bad %s (expected: 0x%02x%02x%02x%02x, actual: 0x%08x)", 328 path, name, i, j, (int) 'K', (int) 'P', buf.getInt(index))); 329 } 330 } 331 isSignature(MappedByteBuffer buf, int index, int i, int j)332 static boolean isSignature(MappedByteBuffer buf, int index, int i, int j) { 333 return (buf.get(index) == 'P') 334 && (buf.get(index + 1) == 'K') 335 && (buf.get(index + 2) == i) 336 && (buf.get(index + 3) == j); 337 } 338 } 339