001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz4; 020 021import java.io.IOException; 022import java.io.OutputStream; 023import java.util.Arrays; 024import java.util.Deque; 025import java.util.Iterator; 026import java.util.LinkedList; 027 028import org.apache.commons.compress.compressors.CompressorOutputStream; 029import org.apache.commons.compress.compressors.lz77support.LZ77Compressor; 030import org.apache.commons.compress.compressors.lz77support.Parameters; 031import org.apache.commons.compress.utils.ByteUtils; 032 033/** 034 * CompressorOutputStream for the LZ4 block format. 035 * 036 * @see <a href="http://lz4.github.io/lz4/lz4_Block_format.html">LZ4 Block Format Description</a> 037 * @since 1.14 038 * @NotThreadSafe 039 */ 040public class BlockLZ4CompressorOutputStream extends CompressorOutputStream { 041 042 final static class Pair { 043 private static int lengths(final int litLength, final int brLength) { 044 final int l = Math.min(litLength, 15); 045 final int br = brLength < 4 ? 0 : brLength < 19 ? brLength - 4 : 15; 046 return l << BlockLZ4CompressorInputStream.SIZE_BITS | br; 047 } 048 private static void writeLength(int length, final OutputStream out) throws IOException { 049 while (length >= 255) { 050 out.write(255); 051 length -= 255; 052 } 053 out.write(length); 054 } 055 private final Deque<byte[]> literals = new LinkedList<>(); 056 057 private int literalLength; 058 059 private int brOffset, brLength; 060 061 private boolean written; 062 063 byte[] addLiteral(final LZ77Compressor.LiteralBlock block) { 064 final byte[] copy = Arrays.copyOfRange(block.getData(), block.getOffset(), 065 block.getOffset() + block.getLength()); 066 literals.add(copy); 067 literalLength += copy.length; 068 return copy; 069 } 070 071 private int backReferenceLength() { 072 return brLength; 073 } 074 075 boolean canBeWritten(final int lengthOfBlocksAfterThisPair) { 076 return hasBackReference() 077 && lengthOfBlocksAfterThisPair >= MIN_OFFSET_OF_LAST_BACK_REFERENCE + MIN_BACK_REFERENCE_LENGTH; 078 } 079 080 boolean hasBackReference() { 081 return brOffset > 0; 082 } 083 084 private boolean hasBeenWritten() { 085 return written; 086 } 087 088 int length() { 089 return literalLength() + brLength; 090 } 091 092 private int literalLength() { 093 // This method is performance sensitive 094 if (literalLength != 0) { 095 return literalLength; 096 } 097 int length = 0; 098 for (final byte[] b : literals) { 099 length += b.length; 100 } 101 return literalLength = length; 102 } 103 104 private void prependLiteral(final byte[] data) { 105 literals.addFirst(data); 106 literalLength += data.length; 107 } 108 109 private void prependTo(final Pair other) { 110 final Iterator<byte[]> listBackwards = literals.descendingIterator(); 111 while (listBackwards.hasNext()) { 112 other.prependLiteral(listBackwards.next()); 113 } 114 } 115 116 void setBackReference(final LZ77Compressor.BackReference block) { 117 if (hasBackReference()) { 118 throw new IllegalStateException(); 119 } 120 brOffset = block.getOffset(); 121 brLength = block.getLength(); 122 } 123 124 private Pair splitWithNewBackReferenceLengthOf(final int newBackReferenceLength) { 125 final Pair p = new Pair(); 126 p.literals.addAll(literals); 127 p.brOffset = brOffset; 128 p.brLength = newBackReferenceLength; 129 return p; 130 } 131 132 void writeTo(final OutputStream out) throws IOException { 133 final int litLength = literalLength(); 134 out.write(lengths(litLength, brLength)); 135 if (litLength >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 136 writeLength(litLength - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 137 } 138 for (final byte[] b : literals) { 139 out.write(b); 140 } 141 if (hasBackReference()) { 142 ByteUtils.toLittleEndian(out, brOffset, 2); 143 if (brLength - MIN_BACK_REFERENCE_LENGTH >= BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK) { 144 writeLength(brLength - MIN_BACK_REFERENCE_LENGTH 145 - BlockLZ4CompressorInputStream.BACK_REFERENCE_SIZE_MASK, out); 146 } 147 } 148 written = true; 149 } 150 } 151 private static final int MIN_BACK_REFERENCE_LENGTH = 4; 152 153 /* 154 155 The LZ4 block format has a few properties that make it less 156 straight-forward than one would hope: 157 158 * literal blocks and back-references must come in pairs (except 159 for the very last literal block), so consecutive literal 160 blocks created by the compressor must be merged into a single 161 block. 162 163 * the start of a literal/back-reference pair contains the length 164 of the back-reference (at least some part of it) so we can't 165 start writing the literal before we know how long the next 166 back-reference is going to be. 167 168 * there are special rules for the final blocks 169 170 > There are specific parsing rules to respect in order to remain 171 > compatible with assumptions made by the decoder : 172 > 173 > 1. The last 5 bytes are always literals 174 > 175 > 2. The last match must start at least 12 bytes before end of 176 > block. Consequently, a block with less than 13 bytes cannot be 177 > compressed. 178 179 which means any back-reference may need to get rewritten as a 180 literal block unless we know the next block is at least of 181 length 5 and the sum of this block's length and offset and the 182 next block's length is at least twelve. 183 184 */ 185 186 private static final int MIN_OFFSET_OF_LAST_BACK_REFERENCE = 12; 187 /** 188 * Returns a builder correctly configured for the LZ4 algorithm. 189 * @return a builder correctly configured for the LZ4 algorithm 190 */ 191 public static Parameters.Builder createParameterBuilder() { 192 final int maxLen = BlockLZ4CompressorInputStream.WINDOW_SIZE - 1; 193 return Parameters.builder(BlockLZ4CompressorInputStream.WINDOW_SIZE) 194 .withMinBackReferenceLength(MIN_BACK_REFERENCE_LENGTH) 195 .withMaxBackReferenceLength(maxLen) 196 .withMaxOffset(maxLen) 197 .withMaxLiteralLength(maxLen); 198 } 199 200 private final LZ77Compressor compressor; 201 202 private final OutputStream os; 203 204 // used in one-arg write method 205 private final byte[] oneByte = new byte[1]; 206 private boolean finished; 207 208 private final Deque<Pair> pairs = new LinkedList<>(); 209 210 // keeps track of the last window-size bytes (64k) in order to be 211 // able to expand back-references when needed 212 private final Deque<byte[]> expandedBlocks = new LinkedList<>(); 213 214 /** 215 * Creates a new LZ4 output stream. 216 * 217 * @param os 218 * An OutputStream to read compressed data from 219 */ 220 public BlockLZ4CompressorOutputStream(final OutputStream os) { 221 this(os, createParameterBuilder().build()); 222 } 223 224 /** 225 * Creates a new LZ4 output stream. 226 * 227 * @param os 228 * An OutputStream to read compressed data from 229 * @param params 230 * The parameters to use for LZ77 compression. 231 */ 232 public BlockLZ4CompressorOutputStream(final OutputStream os, final Parameters params) { 233 this.os = os; 234 compressor = new LZ77Compressor(params, 235 block -> { 236 switch (block.getType()) { 237 case LITERAL: 238 addLiteralBlock((LZ77Compressor.LiteralBlock) block); 239 break; 240 case BACK_REFERENCE: 241 addBackReference((LZ77Compressor.BackReference) block); 242 break; 243 case EOD: 244 writeFinalLiteralBlock(); 245 break; 246 } 247 }); 248 } 249 250 private void addBackReference(final LZ77Compressor.BackReference block) throws IOException { 251 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 252 last.setBackReference(block); 253 recordBackReference(block); 254 clearUnusedBlocksAndPairs(); 255 } 256 257 private void addLiteralBlock(final LZ77Compressor.LiteralBlock block) throws IOException { 258 final Pair last = writeBlocksAndReturnUnfinishedPair(block.getLength()); 259 recordLiteral(last.addLiteral(block)); 260 clearUnusedBlocksAndPairs(); 261 } 262 263 private void clearUnusedBlocks() { 264 int blockLengths = 0; 265 int blocksToKeep = 0; 266 for (final byte[] b : expandedBlocks) { 267 blocksToKeep++; 268 blockLengths += b.length; 269 if (blockLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 270 break; 271 } 272 } 273 final int size = expandedBlocks.size(); 274 for (int i = blocksToKeep; i < size; i++) { 275 expandedBlocks.removeLast(); 276 } 277 } 278 279 private void clearUnusedBlocksAndPairs() { 280 clearUnusedBlocks(); 281 clearUnusedPairs(); 282 } 283 284 private void clearUnusedPairs() { 285 int pairLengths = 0; 286 int pairsToKeep = 0; 287 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 288 final Pair p = it.next(); 289 pairsToKeep++; 290 pairLengths += p.length(); 291 if (pairLengths >= BlockLZ4CompressorInputStream.WINDOW_SIZE) { 292 break; 293 } 294 } 295 final int size = pairs.size(); 296 for (int i = pairsToKeep; i < size; i++) { 297 final Pair p = pairs.peekFirst(); 298 if (!p.hasBeenWritten()) { 299 break; 300 } 301 pairs.removeFirst(); 302 } 303 } 304 305 @Override 306 public void close() throws IOException { 307 try { 308 finish(); 309 } finally { 310 os.close(); 311 } 312 } 313 314 private byte[] expand(final int offset, final int length) { 315 final byte[] expanded = new byte[length]; 316 if (offset == 1) { // surprisingly common special case 317 final byte[] block = expandedBlocks.peekFirst(); 318 final byte b = block[block.length - 1]; 319 if (b != 0) { // the fresh array contains 0s anyway 320 Arrays.fill(expanded, b); 321 } 322 } else { 323 expandFromList(expanded, offset, length); 324 } 325 return expanded; 326 } 327 328 private void expandFromList(final byte[] expanded, final int offset, final int length) { 329 int offsetRemaining = offset; 330 int lengthRemaining = length; 331 int writeOffset = 0; 332 while (lengthRemaining > 0) { 333 // find block that contains offsetRemaining 334 byte[] block = null; 335 final int copyLen; 336 final int copyOffset; 337 if (offsetRemaining > 0) { 338 int blockOffset = 0; 339 for (final byte[] b : expandedBlocks) { 340 if (b.length + blockOffset >= offsetRemaining) { 341 block = b; 342 break; 343 } 344 blockOffset += b.length; 345 } 346 if (block == null) { 347 // should not be possible 348 throw new IllegalStateException("Failed to find a block containing offset " + offset); 349 } 350 copyOffset = blockOffset + block.length - offsetRemaining; 351 copyLen = Math.min(lengthRemaining, block.length - copyOffset); 352 } else { 353 // offsetRemaining is negative or 0 and points into the expanded bytes 354 block = expanded; 355 copyOffset = -offsetRemaining; 356 copyLen = Math.min(lengthRemaining, writeOffset + offsetRemaining); 357 } 358 System.arraycopy(block, copyOffset, expanded, writeOffset, copyLen); 359 offsetRemaining -= copyLen; 360 lengthRemaining -= copyLen; 361 writeOffset += copyLen; 362 } 363 } 364 365 /** 366 * Compresses all remaining data and writes it to the stream, 367 * doesn't close the underlying stream. 368 * @throws IOException if an error occurs 369 */ 370 public void finish() throws IOException { 371 if (!finished) { 372 compressor.finish(); 373 finished = true; 374 } 375 } 376 377 /** 378 * Adds some initial data to fill the window with. 379 * 380 * @param data the data to fill the window with. 381 * @param off offset of real data into the array 382 * @param len amount of data 383 * @throws IllegalStateException if the stream has already started to write data 384 * @see LZ77Compressor#prefill 385 */ 386 public void prefill(final byte[] data, final int off, final int len) { 387 if (len > 0) { 388 final byte[] b = Arrays.copyOfRange(data, off, off + len); 389 compressor.prefill(b); 390 recordLiteral(b); 391 } 392 } 393 394 private void recordBackReference(final LZ77Compressor.BackReference block) { 395 expandedBlocks.addFirst(expand(block.getOffset(), block.getLength())); 396 } 397 398 private void recordLiteral(final byte[] b) { 399 expandedBlocks.addFirst(b); 400 } 401 402 private void rewriteLastPairs() { 403 final LinkedList<Pair> lastPairs = new LinkedList<>(); 404 final LinkedList<Integer> pairLength = new LinkedList<>(); 405 int offset = 0; 406 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 407 final Pair p = it.next(); 408 if (p.hasBeenWritten()) { 409 break; 410 } 411 final int len = p.length(); 412 pairLength.addFirst(len); 413 lastPairs.addFirst(p); 414 offset += len; 415 if (offset >= MIN_OFFSET_OF_LAST_BACK_REFERENCE) { 416 break; 417 } 418 } 419 lastPairs.forEach(pairs::remove); 420 // lastPairs may contain between one and four Pairs: 421 // * the last pair may be a one byte literal 422 // * all other Pairs contain a back-reference which must be four bytes long at minimum 423 // we could merge them all into a single literal block but 424 // this may harm compression. For example compressing 425 // "bla.tar" from our tests yields a last block containing a 426 // back-reference of length > 2k and we'd end up with a last 427 // literal of that size rather than a 2k back-reference and a 428 // 12 byte literal at the end. 429 430 // Instead we merge all but the first of lastPairs into a new 431 // literal-only Pair "replacement" and look at the 432 // back-reference in the first of lastPairs and see if we can 433 // split it. We can split it if it is longer than 16 - 434 // replacement.length (i.e. the minimal length of four is kept 435 // while making sure the last literal is at least twelve bytes 436 // long). If we can't split it, we expand the first of the pairs 437 // as well. 438 439 // this is not optimal, we could get better compression 440 // results with more complex approaches as the last literal 441 // only needs to be five bytes long if the previous 442 // back-reference has an offset big enough 443 444 final int lastPairsSize = lastPairs.size(); 445 int toExpand = 0; 446 for (int i = 1; i < lastPairsSize; i++) { 447 toExpand += pairLength.get(i); 448 } 449 final Pair replacement = new Pair(); 450 if (toExpand > 0) { 451 replacement.prependLiteral(expand(toExpand, toExpand)); 452 } 453 final Pair splitCandidate = lastPairs.get(0); 454 final int stillNeeded = MIN_OFFSET_OF_LAST_BACK_REFERENCE - toExpand; 455 final int brLen = splitCandidate.hasBackReference() ? splitCandidate.backReferenceLength() : 0; 456 if (splitCandidate.hasBackReference() && brLen >= MIN_BACK_REFERENCE_LENGTH + stillNeeded) { 457 replacement.prependLiteral(expand(toExpand + stillNeeded, stillNeeded)); 458 pairs.add(splitCandidate.splitWithNewBackReferenceLengthOf(brLen - stillNeeded)); 459 } else { 460 if (splitCandidate.hasBackReference()) { 461 replacement.prependLiteral(expand(toExpand + brLen, brLen)); 462 } 463 splitCandidate.prependTo(replacement); 464 } 465 pairs.add(replacement); 466 } 467 468 @Override 469 public void write(final byte[] data, final int off, final int len) throws IOException { 470 compressor.compress(data, off, len); 471 } 472 473 @Override 474 public void write(final int b) throws IOException { 475 oneByte[0] = (byte) (b & 0xff); 476 write(oneByte); 477 } 478 479 private Pair writeBlocksAndReturnUnfinishedPair(final int length) throws IOException { 480 writeWritablePairs(length); 481 Pair last = pairs.peekLast(); 482 if (last == null || last.hasBackReference()) { 483 last = new Pair(); 484 pairs.addLast(last); 485 } 486 return last; 487 } 488 489 private void writeFinalLiteralBlock() throws IOException { 490 rewriteLastPairs(); 491 for (final Pair p : pairs) { 492 if (!p.hasBeenWritten()) { 493 p.writeTo(os); 494 } 495 } 496 pairs.clear(); 497 } 498 499 private void writeWritablePairs(final int lengthOfBlocksAfterLastPair) throws IOException { 500 int unwrittenLength = lengthOfBlocksAfterLastPair; 501 for (final Iterator<Pair> it = pairs.descendingIterator(); it.hasNext(); ) { 502 final Pair p = it.next(); 503 if (p.hasBeenWritten()) { 504 break; 505 } 506 unwrittenLength += p.length(); 507 } 508 for (final Pair p : pairs) { 509 if (p.hasBeenWritten()) { 510 continue; 511 } 512 unwrittenLength -= p.length(); 513 if (!p.canBeWritten(unwrittenLength)) { 514 break; 515 } 516 p.writeTo(os); 517 } 518 } 519}