001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz4;
020
021import java.io.IOException;
022import java.io.OutputStream;
023import java.util.Arrays;
024import java.util.Deque;
025import java.util.Iterator;
026import java.util.LinkedList;
027
028import org.apache.commons.compress.compressors.CompressorOutputStream;
029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor;
030import org.apache.commons.compress.compressors.lz77support.Parameters;
031import org.apache.commons.compress.utils.ByteUtils;
032
033/**
034 * CompressorOutputStream for the LZ4 block format.
035 *
036 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a>
037 * @since 1.14
038 * @NotThreadSafe
039 */
040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream {
041
042    final static class Pair {
043        private static int lengths(final int litLength, final int brLength) {
044            final int l = Math.min(litLength, 15);
045            final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15;
046            return l << BlockLZ4CompressorInputStream.SIZE_BITS | br;
047        }
048        private static void writeLength(int length, final OutputStream out) throws IOException {
049            while (length >= 255) {
050                out.write(255);
051                length -= 255;
052            }
053            out.write(length);
054        }
055        private final Deque<byte[]> literals = new LinkedList<>();
056
057        private int literalLength;
058
059        private int brOffset, brLength;
060
061        private boolean written;
062
063        byte[] addLiteral(final LZ77Compressor.LiteralBlock block) {
064            final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(),
065                block.getOffset() + block.getLength());
066            literals.add(copy);
067            literalLength += copy.length;
068            return copy;
069        }
070
071        private int backReferenceLength() {
072            return brLength;
073        }
074
075        boolean canBeWritten(final int lengthOfBlocksAfterThisPair) {
076            return hasBackReference()
077                && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH;
078        }
079
080        boolean hasBackReference() {
081            return brOffset > 0;
082        }
083
084        private boolean hasBeenWritten() {
085            return written;
086        }
087
088        int length() {
089            return literalLength() + brLength;
090        }
091
092        private int literalLength() {
093            // This method is performance sensitive
094            if (literalLength != 0) {
095                return literalLength;
096            }
097            int length = 0;
098            for (final byte[] b : literals) {
099                length += b.length;
100            }
101            return literalLength = length;
102        }
103
104        private void prependLiteral(final byte[] data) {
105            literals.addFirst(data);
106            literalLength += data.length;
107        }
108
109        private void prependTo(final Pair other) {
110            final Iterator<byte[]> listBackwards = literals.descendingIterator();
111            while (listBackwards.hasNext()) {
112                other.prependLiteral(listBackwards.next());
113            }
114        }
115
116        void setBackReference(final LZ77Compressor.BackReference block) {
117            if (hasBackReference()) {
118                throw new IllegalStateException();
119            }
120            brOffset = block.getOffset();
121            brLength = block.getLength();
122        }
123
124        private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) {
125            final Pair p = new Pair();
126            p.literals.addAll(literals);
127            p.brOffset = brOffset;
128            p.brLength = newBackReferenceLength;
129            return p;
130        }
131
132        void writeTo(final OutputStream out) throws IOException {
133            final int litLength = literalLength();
134            out.write(lengths(litLength, brLength));
135            if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
136                writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
137            }
138            for (final byte[] b : literals) {
139                out.write(b);
140            }
141            if (hasBackReference()) {
142                ByteUtils.toLittleEndian(out, brOffset, 2);
143                if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) {
144                    writeLength(brLength - MIN_BACK_REFERENCE_LENGTH
145                        - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out);
146                }
147            }
148            written = true;
149        }
150    }
151    private static final int MIN_BACK_REFERENCE_LENGTH = 4;
152
153    /*
154
155      The LZ4 block format has a few properties that make it less
156      straight-forward than one would hope:
157
158      * literal blocks and back-references must come in pairs (except
159        for the very last literal block), so consecutive literal
160        blocks created by the compressor must be merged into a single
161        block.
162
163      * the start of a literal/back-reference pair contains the length
164        of the back-reference (at least some part of it) so we can't
165        start writing the literal before we know how long the next
166        back-reference is going to be.
167
168      * there are special rules for the final blocks
169
170        > There are specific parsing rules to respect in order to remain
171        > compatible with assumptions made by the decoder :
172        >
173        >     1. The last 5 bytes are always literals
174        >
175        >     2. The last match must start at least 12 bytes before end of
176        >        block. Consequently, a block with less than 13 bytes cannot be
177        >        compressed.
178
179        which means any back-reference may need to get rewritten as a
180        literal block unless we know the next block is at least of
181        length 5 and the sum of this block's length and offset and the
182        next block's length is at least twelve.
183
184    */
185
186    private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12;
187    /**
188     * Returns a builder correctly configured for the LZ4 algorithm.
189     * @return a builder correctly configured for the LZ4 algorithm
190     */
191    public static Parameters.Builder createParameterBuilder() {
192        final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1;
193        return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE)
194            .withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH)
195            .withMaxBackReferenceLength(maxLen)
196            .withMaxOffset(maxLen)
197            .withMaxLiteralLength(maxLen);
198    }
199
200    private final LZ77Compressor compressor;
201
202    private final OutputStream os;
203
204    // used in one-arg write method
205    private final byte[] oneByte = new byte[1];
206    private boolean finished;
207
208    private final Deque<Pair> pairs = new LinkedList<>();
209
210    // keeps track of the last window-size bytes (64k) in order to be
211    // able to expand back-references when needed
212    private final Deque<byte[]> expandedBlocks = new LinkedList<>();
213
214    /**
215     * Creates a new LZ4 output stream.
216     *
217     * @param os
218     *            An OutputStream to read compressed data from
219     */
220    public BlockLZ4CompressorOutputStream(final OutputStream os) {
221        this(os, createParameterBuilder().build());
222    }
223
224    /**
225     * Creates a new LZ4 output stream.
226     *
227     * @param os
228     *            An OutputStream to read compressed data from
229     * @param params
230     *            The parameters to use for LZ77 compression.
231     */
232    public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) {
233        this.os = os;
234        compressor = new LZ77Compressor(params,
235            block -> {
236                switch (block.getType()) {
237                case LITERAL:
238                    addLiteralBlock((LZ77Compressor.LiteralBlock) block);
239                    break;
240                case BACK_REFERENCE:
241                    addBackReference((LZ77Compressor.BackReference) block);
242                    break;
243                case EOD:
244                    writeFinalLiteralBlock();
245                    break;
246                }
247            });
248    }
249
250    private void addBackReference(final LZ77Compressor.BackReference block) throws IOException {
251        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
252        last.setBackReference(block);
253        recordBackReference(block);
254        clearUnusedBlocksAndPairs();
255    }
256
257    private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException {
258        final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength());
259        recordLiteral(last.addLiteral(block));
260        clearUnusedBlocksAndPairs();
261    }
262
263    private void clearUnusedBlocks() {
264        int blockLengths = 0;
265        int blocksToKeep = 0;
266        for (final byte[] b : expandedBlocks) {
267            blocksToKeep++;
268            blockLengths += b.length;
269            if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
270                break;
271            }
272        }
273        final int size = expandedBlocks.size();
274        for (int i = blocksToKeep; i < size; i++) {
275            expandedBlocks.removeLast();
276        }
277    }
278
279    private void clearUnusedBlocksAndPairs() {
280        clearUnusedBlocks();
281        clearUnusedPairs();
282    }
283
284    private void clearUnusedPairs() {
285        int pairLengths = 0;
286        int pairsToKeep = 0;
287        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
288            final Pair p = it.next();
289            pairsToKeep++;
290            pairLengths += p.length();
291            if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) {
292                break;
293            }
294        }
295        final int size = pairs.size();
296        for (int i = pairsToKeep; i < size; i++) {
297            final Pair p = pairs.peekFirst();
298            if (!p.hasBeenWritten()) {
299                break;
300            }
301            pairs.removeFirst();
302        }
303    }
304
305    @Override
306    public void close() throws IOException {
307        try {
308            finish();
309        } finally {
310            os.close();
311        }
312    }
313
314    private byte[] expand(final int offset, final int length) {
315        final byte[] expanded = new byte[length];
316        if (offset == 1) { // surprisingly common special case
317            final byte[] block = expandedBlocks.peekFirst();
318            final byte b = block[block.length - 1];
319            if (b != 0) { // the fresh array contains 0s anyway
320                Arrays.fill(expanded, b);
321            }
322        } else {
323            expandFromList(expanded, offset, length);
324        }
325        return expanded;
326    }
327
328    private void expandFromList(final byte[] expanded, final int offset, final int length) {
329        int offsetRemaining = offset;
330        int lengthRemaining = length;
331        int writeOffset = 0;
332        while (lengthRemaining > 0) {
333            // find block that contains offsetRemaining
334            byte[] block = null;
335            final int copyLen;
336            final int copyOffset;
337            if (offsetRemaining > 0) {
338                int blockOffset = 0;
339                for (final byte[] b : expandedBlocks) {
340                    if (b.length + blockOffset >= offsetRemaining) {
341                        block = b;
342                        break;
343                    }
344                    blockOffset += b.length;
345                }
346                if (block == null) {
347                    // should not be possible
348                    throw new IllegalStateException("Failed to find a block containing offset " + offset);
349                }
350                copyOffset = blockOffset + block.length - offsetRemaining;
351                copyLen = Math.min(lengthRemaining, block.length - copyOffset);
352            } else {
353                // offsetRemaining is negative or 0 and points into the expanded bytes
354                block = expanded;
355                copyOffset = -offsetRemaining;
356                copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining);
357            }
358            System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen);
359            offsetRemaining -= copyLen;
360            lengthRemaining -= copyLen;
361            writeOffset += copyLen;
362        }
363    }
364
365    /**
366     * Compresses all remaining data and writes it to the stream,
367     * doesn't close the underlying stream.
368     * @throws IOException if an error occurs
369     */
370    public void finish() throws IOException {
371        if (!finished) {
372            compressor.finish();
373            finished = true;
374        }
375    }
376
377    /**
378     * Adds some initial data to fill the window with.
379     *
380     * @param data the data to fill the window with.
381     * @param off offset of real data into the array
382     * @param len amount of data
383     * @throws IllegalStateException if the stream has already started to write data
384     * @see LZ77Compressor#prefill
385     */
386    public void prefill(final byte[] data, final int off, final int len) {
387        if (len > 0) {
388            final byte[] b = Arrays.copyOfRange(data, off, off + len);
389            compressor.prefill(b);
390            recordLiteral(b);
391        }
392    }
393
394    private void recordBackReference(final LZ77Compressor.BackReference block) {
395        expandedBlocks.addFirst(expand(block.getOffset(), block.getLength()));
396    }
397
398    private void recordLiteral(final byte[] b) {
399        expandedBlocks.addFirst(b);
400    }
401
402    private void rewriteLastPairs() {
403        final LinkedList<Pair> lastPairs = new LinkedList<>();
404        final LinkedList<Integer> pairLength = new LinkedList<>();
405        int offset = 0;
406        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
407            final Pair p = it.next();
408            if (p.hasBeenWritten()) {
409                break;
410            }
411            final int len = p.length();
412            pairLength.addFirst(len);
413            lastPairs.addFirst(p);
414            offset += len;
415            if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) {
416                break;
417            }
418        }
419        lastPairs.forEach(pairs::remove);
420        // lastPairs may contain between one and four Pairs:
421        // * the last pair may be a one byte literal
422        // * all other Pairs contain a back-reference which must be four bytes long at minimum
423        // we could merge them all into a single literal block but
424        // this may harm compression. For example compressing
425        // "bla.tar" from our tests yields a last block containing a
426        // back-reference of length > 2k and we'd end up with a last
427        // literal of that size rather than a 2k back-reference and a
428        // 12 byte literal at the end.
429
430        // Instead we merge all but the first of lastPairs into a new
431        // literal-only Pair "replacement" and look at the
432        // back-reference in the first of lastPairs and see if we can
433        // split it. We can split it if it is longer than 16 -
434        // replacement.length (i.e. the minimal length of four is kept
435        // while making sure the last literal is at least twelve bytes
436        // long). If we can't split it, we expand the first of the pairs
437        // as well.
438
439        // this is not optimal, we could get better compression
440        // results with more complex approaches as the last literal
441        // only needs to be five bytes long if the previous
442        // back-reference has an offset big enough
443
444        final int lastPairsSize = lastPairs.size();
445        int toExpand = 0;
446        for (int i = 1; i < lastPairsSize; i++) {
447            toExpand += pairLength.get(i);
448        }
449        final Pair replacement = new Pair();
450        if (toExpand > 0) {
451            replacement.prependLiteral(expand(toExpand, toExpand));
452        }
453        final Pair splitCandidate = lastPairs.get(0);
454        final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand;
455        final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0;
456        if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) {
457            replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded));
458            pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded));
459        } else {
460            if (splitCandidate.hasBackReference()) {
461                replacement.prependLiteral(expand(toExpand + brLen, brLen));
462            }
463            splitCandidate.prependTo(replacement);
464        }
465        pairs.add(replacement);
466    }
467
468    @Override
469    public void write(final byte[] data, final int off, final int len) throws IOException {
470        compressor.compress(data, off, len);
471    }
472
473    @Override
474    public void write(final int b) throws IOException {
475        oneByte[0] = (byte) (b & 0xff);
476        write(oneByte);
477    }
478
479    private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException {
480        writeWritablePairs(length);
481        Pair last = pairs.peekLast();
482        if (last == null || last.hasBackReference()) {
483            last = new Pair();
484            pairs.addLast(last);
485        }
486        return last;
487    }
488
489    private void writeFinalLiteralBlock() throws IOException {
490        rewriteLastPairs();
491        for (final Pair p : pairs) {
492            if (!p.hasBeenWritten()) {
493                p.writeTo(os);
494            }
495        }
496        pairs.clear();
497    }
498
499    private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException {
500        int unwrittenLength = lengthOfBlocksAfterLastPair;
501        for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) {
502            final Pair p = it.next();
503            if (p.hasBeenWritten()) {
504                break;
505            }
506            unwrittenLength += p.length();
507        }
508        for (final Pair p : pairs) {
509            if (p.hasBeenWritten()) {
510                continue;
511            }
512            unwrittenLength -= p.length();
513            if (!p.canBeWritten(unwrittenLength)) {
514                break;
515            }
516            p.writeTo(os);
517        }
518    }
519}