1 /*
2  * Copyright 2017 Google Inc. All Rights Reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.turbine.zip;
18 
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.common.io.ByteStreams;
import com.google.common.primitives.UnsignedInts;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.IOError;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import java.util.zip.ZipException;
40 
41 /**
42  * A fast, minimal, and somewhat garbage zip implementation. This exists because graal <a
43  * href="http://mail.openjdk.java.net/pipermail/graal-dev/2017-August/005039.html">doesn't yet
44  * support</a> {@link java.util.zip.ZipFile}, and {@link java.util.zip.ZipInputStream} doesn't have
45  * the performance we'd like (*). If you're reading this, you almost certainly want {@code ZipFile}
46  * instead.
47  *
48  * <p>If you're reading this because you're fixing a bug, sorry.
49  *
50  * <p>(*) A benchmark that iterates over all of the entries in rt.jar takes 6.97ms to run with this
51  * implementation and 202.99ms with ZipInputStream. (Those are averages across 100 reps, and I
52  * verified they're doing the same work.) This is likely largely due to ZipInputStream reading the
53  * entire file from the beginning to scan the local headers, whereas this implementation (and
54  * ZipFile) only read the central directory. Iterating over the entries (but not reading the data)
55  * is an interesting benchmark because we typically only read ~10% of the compile-time classpath, so
56  * most time is spent just scanning entry names. And rt.jar is an interesting test case because
57  * every compilation has to read it, and it dominates the size of the classpath for small
58  * compilations.
59  *
60  * <p>Implementation notes:
61  *
62  * <ul>
63  *   <li>Leading garbage may be supported, since the archive is read backwards using the central
64  *       directory. Archives modified with zip -A may not be supported. Trailing garbage is not
65  *       supported.
66  *   <li>UTF-8 is the only supported encoding.
67  *   <li>STORED and DEFLATE are the only supported compression methods.
68  *   <li>zip64 extensible data sectors are not supported.
69  *   <li>Zip files larger than Integer.MAX_VALUE bytes are not supported.
70  *   <li>The only supported ZIP64 field is ENDTOT. This implementation assumes that the ZIP64 end
71  *       header is present only if ENDTOT in EOCD header is 0xFFFF.
72  * </ul>
73  */
74 public class Zip {
75 
  // Signature compared against the first int of the candidate ZIP64 end header.
  static final int ZIP64_ENDSIG = 0x06064b50;

  // Fixed sizes, in bytes, of the headers this implementation reads.
  static final int LOCHDR = 30; // LOC header size
  static final int CENHDR = 46; // CEN header size
  static final int ENDHDR = 22; // END header size
  static final int ZIP64_LOCHDR = 20; // ZIP64 end locator header size
  static final int ZIP64_ENDHDR = 56; // ZIP64 end header size

  // Byte offsets of fields, relative to the start of the END header.
  static final int ENDTOT = 10; // total number of entries
  static final int ENDSIZ = 12; // central directory size in bytes
  static final int ENDCOM = 20; // zip file comment length

  // Byte offsets of fields, relative to the start of a CEN header.
  static final int CENHOW = 10; // compression method
  static final int CENLEN = 24; // uncompressed size
  static final int CENSIZ = 20; // compressed size
  static final int CENNAM = 28; // filename length
  static final int CENEXT = 30; // extra field length
  static final int CENCOM = 32; // comment length
  static final int CENOFF = 42; // LOC header offset

  // Byte offset of a field, relative to the start of a LOC header.
  static final int LOCEXT = 28; // extra field length

  // Byte offset of a field, relative to the start of the ZIP64 end header.
  static final int ZIP64_ENDSIZ = 40; // central directory size in bytes

  // Value of ENDTOT that triggers the check for a ZIP64 end header.
  static final int ZIP64_MAGICCOUNT = 0xFFFF;
101 
102   /** Iterates over a zip archive. */
103   static class ZipIterator implements Iterator<Entry> {
104 
105     /** A reader for the backing storage. */
106     private final FileChannel chan;
107 
108     private final Path path;
109     private int cdindex = 0;
110     private final MappedByteBuffer cd;
111     private final CharsetDecoder decoder = UTF_8.newDecoder();
112 
ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd)113     ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd) {
114       this.path = path;
115       this.chan = chan;
116       this.cd = cd;
117     }
118 
119     @Override
hasNext()120     public boolean hasNext() {
121       return cdindex < cd.limit();
122     }
123 
124     /* Returns a {@link Entry} for the current CEN entry. */
125     @Override
next()126     public Entry next() {
127       // TODO(cushon): technically we're supposed to throw NSEE
128       checkSignature(path, cd, cdindex, 1, 2, "CENSIG");
129       int nameLength = cd.getChar(cdindex + CENNAM);
130       int extLength = cd.getChar(cdindex + CENEXT);
131       int commentLength = cd.getChar(cdindex + CENCOM);
132       Entry entry = new Entry(path, chan, string(cd, cdindex + CENHDR, nameLength), cd, cdindex);
133       cdindex += CENHDR + nameLength + extLength + commentLength;
134       return entry;
135     }
136 
string(ByteBuffer buf, int offset, int length)137     public String string(ByteBuffer buf, int offset, int length) {
138       buf = buf.duplicate();
139       buf.position(offset);
140       buf.limit(offset + length);
141       decoder.reset();
142       try {
143         return decoder.decode(buf).toString();
144       } catch (CharacterCodingException e) {
145         throw new IOError(e);
146       }
147     }
148   }
149 
150   /** Provides an {@link Iterable} of {@link Entry} over a zip archive. */
151   public static class ZipIterable implements Iterable<Entry>, Closeable {
152 
153     private final Path path;
154     private final FileChannel chan;
155     private final MappedByteBuffer cd;
156 
ZipIterable(Path path)157     public ZipIterable(Path path) throws IOException {
158       this.path = path;
159       this.chan = FileChannel.open(path, StandardOpenOption.READ);
160       // Locate the EOCD
161       long size = chan.size();
162       if (size < ENDHDR) {
163         throw new ZipException("invalid zip archive");
164       }
165       long eocdOffset = size - ENDHDR;
166       MappedByteBuffer eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ENDHDR);
167       eocd.order(ByteOrder.LITTLE_ENDIAN);
168       int index = 0;
169       int commentSize = 0;
170       if (!isSignature(eocd, 0, 5, 6)) {
171         // The archive may contain a zip file comment; keep looking for the EOCD.
172         long start = Math.max(0, size - ENDHDR - 0xFFFF);
173         eocd = chan.map(MapMode.READ_ONLY, start, (size - start));
174         eocd.order(ByteOrder.LITTLE_ENDIAN);
175         index = (int) ((size - start) - ENDHDR);
176         while (index > 0) {
177           index--;
178           eocd.position(index);
179           if (isSignature(eocd, index, 5, 6)) {
180             commentSize = (int) ((size - start) - ENDHDR) - index;
181             eocdOffset = start + index;
182             break;
183           }
184         }
185       }
186       checkSignature(path, eocd, index, 5, 6, "ENDSIG");
187       int totalEntries = eocd.getChar(index + ENDTOT);
188       long cdsize = UnsignedInts.toLong(eocd.getInt(index + ENDSIZ));
189       int actualCommentSize = eocd.getChar(index + ENDCOM);
190       if (commentSize != actualCommentSize) {
191         throw new ZipException(
192             String.format(
193                 "zip file comment length was %d, expected %d", commentSize, actualCommentSize));
194       }
195       // If the number of entries is 0xffff, check if the archive has a zip64 EOCD locator.
196       if (totalEntries == ZIP64_MAGICCOUNT) {
197         // Assume the zip64 EOCD has the usual size; we don't support zip64 extensible data sectors.
198         long zip64eocdOffset = size - ENDHDR - ZIP64_LOCHDR - ZIP64_ENDHDR;
199         MappedByteBuffer zip64eocd = chan.map(MapMode.READ_ONLY, zip64eocdOffset, ZIP64_ENDHDR);
200         zip64eocd.order(ByteOrder.LITTLE_ENDIAN);
201         // Note that zip reading is necessarily best-effort, since an archive could contain 0xFFFF
202         // entries and the last entry's data could contain a ZIP64_ENDSIG. Some implementations
203         // read the full EOCD records and compare them.
204         if (zip64eocd.getInt(0) == ZIP64_ENDSIG) {
205           cdsize = zip64eocd.getLong(ZIP64_ENDSIZ);
206           eocdOffset = zip64eocdOffset;
207         }
208       }
209       this.cd = chan.map(MapMode.READ_ONLY, eocdOffset - cdsize, cdsize);
210       cd.order(ByteOrder.LITTLE_ENDIAN);
211     }
212 
213     @Override
iterator()214     public Iterator<Entry> iterator() {
215       return new ZipIterator(path, chan, cd);
216     }
217 
218     @Override
close()219     public void close() throws IOException {
220       chan.close();
221     }
222   }
223 
224   /** An entry in a zip archive. */
225   public static class Entry {
226 
227     private final Path path;
228     private final FileChannel chan;
229     private final String name;
230     private final ByteBuffer cd;
231     private final int cdindex;
232 
Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex)233     public Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex) {
234       this.path = path;
235       this.chan = chan;
236       this.name = name;
237       this.cd = cd;
238       this.cdindex = cdindex;
239     }
240 
241     /** The entry name. */
name()242     public String name() {
243       return name;
244     }
245 
246     /** The entry data. */
data()247     public byte[] data() {
248       // Read the offset and variable lengths from the central directory and then try to map in the
249       // data section in one shot.
250       long offset = UnsignedInts.toLong(cd.getInt(cdindex + CENOFF));
251       int nameLength = cd.getChar(cdindex + CENNAM);
252       int extLength = cd.getChar(cdindex + CENEXT);
253       int compression = cd.getChar(cdindex + CENHOW);
254       switch (compression) {
255         case 0x8:
256           return getBytes(
257               offset,
258               nameLength,
259               extLength,
260               UnsignedInts.toLong(cd.getInt(cdindex + CENSIZ)),
261               /*deflate=*/ true);
262         case 0x0:
263           return getBytes(
264               offset,
265               nameLength,
266               extLength,
267               UnsignedInts.toLong(cd.getInt(cdindex + CENLEN)),
268               /*deflate=*/ false);
269         default:
270           throw new AssertionError(
271               String.format("unsupported compression mode: 0x%x", compression));
272       }
273     }
274 
275     /**
276      * Number of extra bytes to read for each file, to avoid re-mapping the data if the local header
277      * reports more extra field data than the central directory.
278      */
279     static final int EXTRA_FIELD_SLACK = 128;
280 
getBytes( long offset, int nameLength, int cenExtLength, long size, boolean deflate)281     private byte[] getBytes(
282         long offset, int nameLength, int cenExtLength, long size, boolean deflate) {
283       if (size > Integer.MAX_VALUE) {
284         throw new IllegalArgumentException("unsupported zip entry size: " + size);
285       }
286       try {
287         MappedByteBuffer fc =
288             chan.map(
289                 MapMode.READ_ONLY,
290                 offset,
291                 Math.min(
292                     LOCHDR + nameLength + cenExtLength + size + EXTRA_FIELD_SLACK,
293                     chan.size() - offset));
294         fc.order(ByteOrder.LITTLE_ENDIAN);
295         checkSignature(path, fc, /* index= */ 0, 3, 4, "LOCSIG");
296         int locExtLength = fc.getChar(LOCEXT);
297         if (locExtLength > cenExtLength + EXTRA_FIELD_SLACK) {
298           // If the local header's extra fields don't match the central directory and we didn't
299           // leave enough slac, re-map the data section with the correct extra field length.
300           fc = chan.map(MapMode.READ_ONLY, offset + LOCHDR + nameLength + locExtLength, size);
301           fc.order(ByteOrder.LITTLE_ENDIAN);
302         } else {
303           // Otherwise seek past the local header, name, and extra fields to the data.
304           fc.position(LOCHDR + nameLength + locExtLength);
305           fc.limit((int) (LOCHDR + nameLength + locExtLength + size));
306         }
307         byte[] bytes = new byte[(int) size];
308         fc.get(bytes);
309         if (deflate) {
310           bytes =
311               ByteStreams.toByteArray(
312                   new InflaterInputStream(
313                       new ByteArrayInputStream(bytes), new Inflater(/*nowrap=*/ true)));
314         }
315         return bytes;
316       } catch (IOException e) {
317         throw new IOError(e);
318       }
319     }
320   }
321 
checkSignature( Path path, MappedByteBuffer buf, int index, int i, int j, String name)322   static void checkSignature(
323       Path path, MappedByteBuffer buf, int index, int i, int j, String name) {
324     if (!isSignature(buf, index, i, j)) {
325       throw new AssertionError(
326           String.format(
327               "%s: bad %s (expected: 0x%02x%02x%02x%02x, actual: 0x%08x)",
328               path, name, i, j, (int) 'K', (int) 'P', buf.getInt(index)));
329     }
330   }
331 
isSignature(MappedByteBuffer buf, int index, int i, int j)332   static boolean isSignature(MappedByteBuffer buf, int index, int i, int j) {
333     return (buf.get(index) == 'P')
334         && (buf.get(index + 1) == 'K')
335         && (buf.get(index + 2) == i)
336         && (buf.get(index + 3) == j);
337   }
338 }
339