1 /*
2  *  Licensed to the Apache Software Foundation (ASF) under one or more
3  *  contributor license agreements.  See the NOTICE file distributed with
4  *  this work for additional information regarding copyright ownership.
5  *  The ASF licenses this file to You under the Apache License, Version 2.0
6  *  (the "License"); you may not use this file except in compliance with
7  *  the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  *
17  */
18 
19 /*
20  * This package is based on the work done by Timothy Gerard Endres
21  * (time@ice.com) to whom the Ant project is very grateful for his great code.
22  */
23 
24 package org.apache.commons.compress.archivers.tar;
25 
26 import java.io.ByteArrayOutputStream;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.util.HashMap;
30 import java.util.Map;
31 
32 import org.apache.commons.compress.archivers.ArchiveEntry;
33 import org.apache.commons.compress.archivers.ArchiveInputStream;
34 import org.apache.commons.compress.archivers.zip.ZipEncoding;
35 import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
36 import org.apache.commons.compress.utils.ArchiveUtils;
37 import org.apache.commons.compress.utils.CharsetNames;
38 import org.apache.commons.compress.utils.IOUtils;
39 
/**
 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
 * @NotThreadSafe
 */
47 public class TarArchiveInputStream extends ArchiveInputStream {
48 
    /** Size in bytes of the scratch buffer used while collecting long name data. */
    private static final int SMALL_BUFFER_SIZE = 256;

    /** Scratch buffer that {@code getLongNameData()} reads the long name through. */
    private final byte[] smallBuf = new byte[SMALL_BUFFER_SIZE];

    /** The size of a single TAR record (an entry header occupies one record), in bytes */
    private final int recordSize;

    /** The size of a block in bytes; used to skip the padding of the last block at the end of the archive */
    private final int blockSize;

    /** True once the end of the archive has been hit */
    private boolean hasHitEOF;

    /** Size of the current entry */
    private long entrySize;

    /** How far into the current entry's data the stream has advanced */
    private long entryOffset;

    /** The underlying input stream the archive is read from */
    private final InputStream is;

    /** The meta-data about the current entry, or null if there is none */
    private TarArchiveEntry currEntry;

    /** The encoding used to decode entry names and link names */
    private final ZipEncoding zipEncoding;

    // the provided encoding name as passed to the constructor (for unit tests)
    final String encoding;

    // the most recently read global PAX headers; applied to entries that
    // have no PAX header of their own
    private Map<String, String> globalPaxHeaders = new HashMap<>();
82 
83     /**
84      * Constructor for TarInputStream.
85      * @param is the input stream to use
86      */
TarArchiveInputStream(final InputStream is)87     public TarArchiveInputStream(final InputStream is) {
88         this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
89     }
90 
91     /**
92      * Constructor for TarInputStream.
93      * @param is the input stream to use
94      * @param encoding name of the encoding to use for file names
95      * @since 1.4
96      */
TarArchiveInputStream(final InputStream is, final String encoding)97     public TarArchiveInputStream(final InputStream is, final String encoding) {
98         this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
99              encoding);
100     }
101 
102     /**
103      * Constructor for TarInputStream.
104      * @param is the input stream to use
105      * @param blockSize the block size to use
106      */
TarArchiveInputStream(final InputStream is, final int blockSize)107     public TarArchiveInputStream(final InputStream is, final int blockSize) {
108         this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
109     }
110 
111     /**
112      * Constructor for TarInputStream.
113      * @param is the input stream to use
114      * @param blockSize the block size to use
115      * @param encoding name of the encoding to use for file names
116      * @since 1.4
117      */
TarArchiveInputStream(final InputStream is, final int blockSize, final String encoding)118     public TarArchiveInputStream(final InputStream is, final int blockSize,
119                                  final String encoding) {
120         this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
121     }
122 
123     /**
124      * Constructor for TarInputStream.
125      * @param is the input stream to use
126      * @param blockSize the block size to use
127      * @param recordSize the record size to use
128      */
TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize)129     public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize) {
130         this(is, blockSize, recordSize, null);
131     }
132 
133     /**
134      * Constructor for TarInputStream.
135      * @param is the input stream to use
136      * @param blockSize the block size to use
137      * @param recordSize the record size to use
138      * @param encoding name of the encoding to use for file names
139      * @since 1.4
140      */
TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize, final String encoding)141     public TarArchiveInputStream(final InputStream is, final int blockSize, final int recordSize,
142                                  final String encoding) {
143         this.is = is;
144         this.hasHitEOF = false;
145         this.encoding = encoding;
146         this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
147         this.recordSize = recordSize;
148         this.blockSize = blockSize;
149     }
150 
151     /**
152      * Closes this stream. Calls the TarBuffer's close() method.
153      * @throws IOException on error
154      */
155     @Override
close()156     public void close() throws IOException {
157         is.close();
158     }
159 
160     /**
161      * Get the record size being used by this stream's buffer.
162      *
163      * @return The TarBuffer record size.
164      */
getRecordSize()165     public int getRecordSize() {
166         return recordSize;
167     }
168 
169     /**
170      * Get the available data that can be read from the current
171      * entry in the archive. This does not indicate how much data
172      * is left in the entire archive, only in the current entry.
173      * This value is determined from the entry's size header field
174      * and the amount of data already read from the current entry.
175      * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
176      * bytes are left in the current entry in the archive.
177      *
178      * @return The number of available bytes for the current entry.
179      * @throws IOException for signature
180      */
181     @Override
available()182     public int available() throws IOException {
183         if (isDirectory()) {
184             return 0;
185         }
186         if (entrySize - entryOffset > Integer.MAX_VALUE) {
187             return Integer.MAX_VALUE;
188         }
189         return (int) (entrySize - entryOffset);
190     }
191 
192 
193     /**
194      * Skips over and discards <code>n</code> bytes of data from this input
195      * stream. The <code>skip</code> method may, for a variety of reasons, end
196      * up skipping over some smaller number of bytes, possibly <code>0</code>.
197      * This may result from any of a number of conditions; reaching end of file
198      * or end of entry before <code>n</code> bytes have been skipped; are only
199      * two possibilities. The actual number of bytes skipped is returned. If
200      * <code>n</code> is negative, no bytes are skipped.
201      *
202      *
203      * @param n
204      *            the number of bytes to be skipped.
205      * @return the actual number of bytes skipped.
206      * @throws IOException
207      *                if some other I/O error occurs.
208      */
209     @Override
skip(final long n)210     public long skip(final long n) throws IOException {
211         if (n <= 0 || isDirectory()) {
212             return 0;
213         }
214 
215         final long available = entrySize - entryOffset;
216         final long skipped = IOUtils.skip(is, Math.min(n, available));
217         count(skipped);
218         entryOffset += skipped;
219         return skipped;
220     }
221 
222     /**
223      * Since we do not support marking just yet, we return false.
224      *
225      * @return False.
226      */
227     @Override
markSupported()228     public boolean markSupported() {
229         return false;
230     }
231 
232     /**
233      * Since we do not support marking just yet, we do nothing.
234      *
235      * @param markLimit The limit to mark.
236      */
237     @Override
mark(final int markLimit)238     public void mark(final int markLimit) {
239     }
240 
241     /**
242      * Since we do not support marking just yet, we do nothing.
243      */
244     @Override
reset()245     public synchronized void reset() {
246     }
247 
248     /**
249      * Get the next entry in this tar archive. This will skip
250      * over any remaining data in the current entry, if there
251      * is one, and place the input stream at the header of the
252      * next entry, and read the header and instantiate a new
253      * TarEntry from the header bytes and return that entry.
254      * If there are no more entries in the archive, null will
255      * be returned to indicate that the end of the archive has
256      * been reached.
257      *
258      * @return The next TarEntry in the archive, or null.
259      * @throws IOException on error
260      */
getNextTarEntry()261     public TarArchiveEntry getNextTarEntry() throws IOException {
262         if (isAtEOF()) {
263             return null;
264         }
265 
266         if (currEntry != null) {
267             /* Skip will only go to the end of the current entry */
268             IOUtils.skip(this, Long.MAX_VALUE);
269 
270             /* skip to the end of the last record */
271             skipRecordPadding();
272         }
273 
274         final byte[] headerBuf = getRecord();
275 
276         if (headerBuf == null) {
277             /* hit EOF */
278             currEntry = null;
279             return null;
280         }
281 
282         try {
283             currEntry = new TarArchiveEntry(headerBuf, zipEncoding);
284         } catch (final IllegalArgumentException e) {
285             throw new IOException("Error detected parsing the header", e);
286         }
287 
288         entryOffset = 0;
289         entrySize = currEntry.getSize();
290 
291         if (currEntry.isGNULongLinkEntry()) {
292             final byte[] longLinkData = getLongNameData();
293             if (longLinkData == null) {
294                 // Bugzilla: 40334
295                 // Malformed tar file - long link entry name not followed by
296                 // entry
297                 return null;
298             }
299             currEntry.setLinkName(zipEncoding.decode(longLinkData));
300         }
301 
302         if (currEntry.isGNULongNameEntry()) {
303             final byte[] longNameData = getLongNameData();
304             if (longNameData == null) {
305                 // Bugzilla: 40334
306                 // Malformed tar file - long entry name not followed by
307                 // entry
308                 return null;
309             }
310             currEntry.setName(zipEncoding.decode(longNameData));
311         }
312 
313         if (currEntry.isGlobalPaxHeader()){ // Process Global Pax headers
314             readGlobalPaxHeaders();
315         }
316 
317         if (currEntry.isPaxHeader()){ // Process Pax headers
318             paxHeaders();
319         } else if (!globalPaxHeaders.isEmpty()) {
320             applyPaxHeadersToCurrentEntry(globalPaxHeaders);
321         }
322 
323         if (currEntry.isOldGNUSparse()){ // Process sparse files
324             readOldGNUSparse();
325         }
326 
327         // If the size of the next element in the archive has changed
328         // due to a new size being reported in the posix header
329         // information, we update entrySize here so that it contains
330         // the correct value.
331         entrySize = currEntry.getSize();
332 
333         return currEntry;
334     }
335 
336     /**
337      * The last record block should be written at the full size, so skip any
338      * additional space used to fill a record after an entry
339      */
skipRecordPadding()340     private void skipRecordPadding() throws IOException {
341         if (!isDirectory() && this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
342             final long numRecords = (this.entrySize / this.recordSize) + 1;
343             final long padding = (numRecords * this.recordSize) - this.entrySize;
344             final long skipped = IOUtils.skip(is, padding);
345             count(skipped);
346         }
347     }
348 
349     /**
350      * Get the next entry in this tar archive as longname data.
351      *
352      * @return The next entry in the archive as longname data, or null.
353      * @throws IOException on error
354      */
getLongNameData()355     protected byte[] getLongNameData() throws IOException {
356         // read in the name
357         final ByteArrayOutputStream longName = new ByteArrayOutputStream();
358         int length = 0;
359         while ((length = read(smallBuf)) >= 0) {
360             longName.write(smallBuf, 0, length);
361         }
362         getNextEntry();
363         if (currEntry == null) {
364             // Bugzilla: 40334
365             // Malformed tar file - long entry name not followed by entry
366             return null;
367         }
368         byte[] longNameData = longName.toByteArray();
369         // remove trailing null terminator(s)
370         length = longNameData.length;
371         while (length > 0 && longNameData[length - 1] == 0) {
372             --length;
373         }
374         if (length != longNameData.length) {
375             final byte[] l = new byte[length];
376             System.arraycopy(longNameData, 0, l, 0, length);
377             longNameData = l;
378         }
379         return longNameData;
380     }
381 
382     /**
383      * Get the next record in this tar archive. This will skip
384      * over any remaining data in the current entry, if there
385      * is one, and place the input stream at the header of the
386      * next entry.
387      *
388      * <p>If there are no more entries in the archive, null will be
389      * returned to indicate that the end of the archive has been
390      * reached.  At the same time the {@code hasHitEOF} marker will be
391      * set to true.</p>
392      *
393      * @return The next header in the archive, or null.
394      * @throws IOException on error
395      */
getRecord()396     private byte[] getRecord() throws IOException {
397         byte[] headerBuf = readRecord();
398         setAtEOF(isEOFRecord(headerBuf));
399         if (isAtEOF() && headerBuf != null) {
400             tryToConsumeSecondEOFRecord();
401             consumeRemainderOfLastBlock();
402             headerBuf = null;
403         }
404         return headerBuf;
405     }
406 
407     /**
408      * Determine if an archive record indicate End of Archive. End of
409      * archive is indicated by a record that consists entirely of null bytes.
410      *
411      * @param record The record data to check.
412      * @return true if the record data is an End of Archive
413      */
isEOFRecord(final byte[] record)414     protected boolean isEOFRecord(final byte[] record) {
415         return record == null || ArchiveUtils.isArrayZero(record, recordSize);
416     }
417 
418     /**
419      * Read a record from the input stream and return the data.
420      *
421      * @return The record data or null if EOF has been hit.
422      * @throws IOException on error
423      */
readRecord()424     protected byte[] readRecord() throws IOException {
425 
426         final byte[] record = new byte[recordSize];
427 
428         final int readNow = IOUtils.readFully(is, record);
429         count(readNow);
430         if (readNow != recordSize) {
431             return null;
432         }
433 
434         return record;
435     }
436 
readGlobalPaxHeaders()437     private void readGlobalPaxHeaders() throws IOException {
438         globalPaxHeaders = parsePaxHeaders(this);
439         getNextEntry(); // Get the actual file entry
440     }
441 
paxHeaders()442     private void paxHeaders() throws IOException{
443         final Map<String, String> headers = parsePaxHeaders(this);
444         getNextEntry(); // Get the actual file entry
445         applyPaxHeadersToCurrentEntry(headers);
446     }
447 
448     // NOTE, using a Map here makes it impossible to ever support GNU
449     // sparse files using the PAX Format 0.0, see
450     // https://www.gnu.org/software/tar/manual/html_section/tar_92.html#SEC188
parsePaxHeaders(final InputStream i)451     Map<String, String> parsePaxHeaders(final InputStream i)
452         throws IOException {
453         final Map<String, String> headers = new HashMap<>(globalPaxHeaders);
454         // Format is "length keyword=value\n";
455         while(true){ // get length
456             int ch;
457             int len = 0;
458             int read = 0;
459             while((ch = i.read()) != -1) {
460                 read++;
461                 if (ch == '\n') { // blank line in header
462                     break;
463                 } else if (ch == ' '){ // End of length string
464                     // Get keyword
465                     final ByteArrayOutputStream coll = new ByteArrayOutputStream();
466                     while((ch = i.read()) != -1) {
467                         read++;
468                         if (ch == '='){ // end of keyword
469                             final String keyword = coll.toString(CharsetNames.UTF_8);
470                             // Get rest of entry
471                             final int restLen = len - read;
472                             if (restLen == 1) { // only NL
473                                 headers.remove(keyword);
474                             } else {
475                                 final byte[] rest = new byte[restLen];
476                                 final int got = IOUtils.readFully(i, rest);
477                                 if (got != restLen) {
478                                     throw new IOException("Failed to read "
479                                                           + "Paxheader. Expected "
480                                                           + restLen
481                                                           + " bytes, read "
482                                                           + got);
483                                 }
484                                 // Drop trailing NL
485                                 final String value = new String(rest, 0,
486                                                           restLen - 1, CharsetNames.UTF_8);
487                                 headers.put(keyword, value);
488                             }
489                             break;
490                         }
491                         coll.write((byte) ch);
492                     }
493                     break; // Processed single header
494                 }
495                 len *= 10;
496                 len += ch - '0';
497             }
498             if (ch == -1){ // EOF
499                 break;
500             }
501         }
502         return headers;
503     }
504 
applyPaxHeadersToCurrentEntry(final Map<String, String> headers)505     private void applyPaxHeadersToCurrentEntry(final Map<String, String> headers) {
506         currEntry.updateEntryFromPaxHeaders(headers);
507 
508     }
509 
510     /**
511      * Adds the sparse chunks from the current entry to the sparse chunks,
512      * including any additional sparse entries following the current entry.
513      *
514      * @throws IOException on error
515      *
516      * @todo Sparse files get not yet really processed.
517      */
readOldGNUSparse()518     private void readOldGNUSparse() throws IOException {
519         /* we do not really process sparse files yet
520         sparses = new ArrayList();
521         sparses.addAll(currEntry.getSparses());
522         */
523         if (currEntry.isExtended()) {
524             TarArchiveSparseEntry entry;
525             do {
526                 final byte[] headerBuf = getRecord();
527                 if (headerBuf == null) {
528                     currEntry = null;
529                     break;
530                 }
531                 entry = new TarArchiveSparseEntry(headerBuf);
532                 /* we do not really process sparse files yet
533                 sparses.addAll(entry.getSparses());
534                 */
535             } while (entry.isExtended());
536         }
537     }
538 
isDirectory()539     private boolean isDirectory() {
540         return currEntry != null && currEntry.isDirectory();
541     }
542 
543     /**
544      * Returns the next Archive Entry in this Stream.
545      *
546      * @return the next entry,
547      *         or {@code null} if there are no more entries
548      * @throws IOException if the next entry could not be read
549      */
550     @Override
getNextEntry()551     public ArchiveEntry getNextEntry() throws IOException {
552         return getNextTarEntry();
553     }
554 
555     /**
556      * Tries to read the next record rewinding the stream if it is not a EOF record.
557      *
558      * <p>This is meant to protect against cases where a tar
559      * implementation has written only one EOF record when two are
560      * expected.  Actually this won't help since a non-conforming
561      * implementation likely won't fill full blocks consisting of - by
562      * default - ten records either so we probably have already read
563      * beyond the archive anyway.</p>
564      */
tryToConsumeSecondEOFRecord()565     private void tryToConsumeSecondEOFRecord() throws IOException {
566         boolean shouldReset = true;
567         final boolean marked = is.markSupported();
568         if (marked) {
569             is.mark(recordSize);
570         }
571         try {
572             shouldReset = !isEOFRecord(readRecord());
573         } finally {
574             if (shouldReset && marked) {
575                 pushedBackBytes(recordSize);
576             	is.reset();
577             }
578         }
579     }
580 
581     /**
582      * Reads bytes from the current tar archive entry.
583      *
584      * This method is aware of the boundaries of the current
585      * entry in the archive and will deal with them as if they
586      * were this stream's start and EOF.
587      *
588      * @param buf The buffer into which to place bytes read.
589      * @param offset The offset at which to place bytes read.
590      * @param numToRead The number of bytes to read.
591      * @return The number of bytes read, or -1 at EOF.
592      * @throws IOException on error
593      */
594     @Override
read(final byte[] buf, final int offset, int numToRead)595     public int read(final byte[] buf, final int offset, int numToRead) throws IOException {
596     	int totalRead = 0;
597 
598         if (isAtEOF() || isDirectory() || entryOffset >= entrySize) {
599             return -1;
600         }
601 
602         if (currEntry == null) {
603             throw new IllegalStateException("No current tar entry");
604         }
605 
606         numToRead = Math.min(numToRead, available());
607 
608         totalRead = is.read(buf, offset, numToRead);
609 
610         if (totalRead == -1) {
611             if (numToRead > 0) {
612                 throw new IOException("Truncated TAR archive");
613             }
614             setAtEOF(true);
615         } else {
616             count(totalRead);
617             entryOffset += totalRead;
618         }
619 
620         return totalRead;
621     }
622 
623     /**
624      * Whether this class is able to read the given entry.
625      *
626      * <p>May return false if the current entry is a sparse file.</p>
627      */
628     @Override
canReadEntryData(final ArchiveEntry ae)629     public boolean canReadEntryData(final ArchiveEntry ae) {
630         if (ae instanceof TarArchiveEntry) {
631             final TarArchiveEntry te = (TarArchiveEntry) ae;
632             return !te.isSparse();
633         }
634         return false;
635     }
636 
637     /**
638      * Get the current TAR Archive Entry that this input stream is processing
639      *
640      * @return The current Archive Entry
641      */
getCurrentEntry()642     public TarArchiveEntry getCurrentEntry() {
643         return currEntry;
644     }
645 
setCurrentEntry(final TarArchiveEntry e)646     protected final void setCurrentEntry(final TarArchiveEntry e) {
647         currEntry = e;
648     }
649 
isAtEOF()650     protected final boolean isAtEOF() {
651         return hasHitEOF;
652     }
653 
setAtEOF(final boolean b)654     protected final void setAtEOF(final boolean b) {
655         hasHitEOF = b;
656     }
657 
658     /**
659      * This method is invoked once the end of the archive is hit, it
660      * tries to consume the remaining bytes under the assumption that
661      * the tool creating this archive has padded the last block.
662      */
consumeRemainderOfLastBlock()663     private void consumeRemainderOfLastBlock() throws IOException {
664         final long bytesReadOfLastBlock = getBytesRead() % blockSize;
665         if (bytesReadOfLastBlock > 0) {
666             final long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock);
667             count(skipped);
668         }
669     }
670 
671     /**
672      * Checks if the signature matches what is expected for a tar file.
673      *
674      * @param signature
675      *            the bytes to check
676      * @param length
677      *            the number of bytes to check
678      * @return true, if this stream is a tar archive stream, false otherwise
679      */
matches(final byte[] signature, final int length)680     public static boolean matches(final byte[] signature, final int length) {
681         if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
682             return false;
683         }
684 
685         if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
686                 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
687             &&
688             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
689                 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
690                 ){
691             return true;
692         }
693         if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
694                 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
695             &&
696             (
697              ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
698                 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
699             ||
700             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
701                 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
702             )
703                 ){
704             return true;
705         }
706         // COMPRESS-107 - recognise Ant tar files
707         return ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
708                 signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
709                 &&
710                 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
711                         signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN);
712     }
713 
714 }
715