inflate.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478
  1. /* vim: tabstop=4 shiftwidth=4 noexpandtab
  2. * This file is part of ToaruOS and is released under the terms
  3. * of the NCSA / University of Illinois License - see LICENSE.md
  4. * Copyright (C) 2020 K. Lange
  5. *
  6. * libtoaru_inflate: Methods for decompressing DEFLATE and gzip payloads.
  7. */
  8. #include <stdint.h>
  9. #include <stddef.h>
  10. #ifndef _BOOT_LOADER
  11. #include <toaru/inflate.h>
  12. #endif
  13. /**
  14. * Decoded Huffman table
  15. */
  16. struct huff {
  17. uint16_t counts[16]; /* Number of symbols of each length */
  18. uint16_t symbols[288]; /* Ordered symbols */
  19. };
  20. /**
  21. * 32K ringbuffer for backwards lookup
  22. */
  23. struct huff_ring {
  24. size_t pointer;
  25. uint8_t data[32768];
  26. };
  27. /**
  28. * Fixed Huffman code tables, generated later.
  29. */
  30. struct huff fixed_lengths;
  31. struct huff fixed_dists;
  32. /**
  33. * Read a little-endian short from the input.
  34. */
  35. static uint16_t read_16le(struct inflate_context * ctx) {
  36. uint16_t a, b;
  37. a = ctx->get_input(ctx);
  38. b = ctx->get_input(ctx);
  39. return (a << 0) | (b << 8);
  40. }
  41. /**
  42. * Read a single bit from the source.
  43. * Fills the byte buffer with one byte when it runs out.
  44. */
  45. static uint8_t read_bit(struct inflate_context * ctx) {
  46. /* When we run out of bits... */
  47. if (ctx->buffer_size == 0) {
  48. /* Refill from the next input byte */
  49. ctx->bit_buffer = ctx->get_input(ctx);
  50. /* And restore bit buffer size to 8 bits */
  51. ctx->buffer_size = 8;
  52. }
  53. /* Get the next available bit */
  54. int out = ctx->bit_buffer & 1;
  55. /* Shift the bit buffer forward */
  56. ctx->bit_buffer >>= 1;
  57. /* There is now one less bit available */
  58. ctx->buffer_size--;
  59. return out;
  60. }
  61. /**
  62. * Read multible bits, in bit order, from the source.
  63. */
  64. static uint32_t read_bits(struct inflate_context * ctx, unsigned int count) {
  65. uint32_t out = 0;
  66. for (unsigned int bit = 0; bit < count; bit++) {
  67. /* Read one bit at a time, from least to most significant */
  68. out |= (read_bit(ctx) << bit);
  69. }
  70. return out;
  71. }
  72. /**
  73. * Build a Huffman table from an array of lengths.
  74. */
  75. static void build_huffman(uint8_t * lengths, size_t size, struct huff * out) {
  76. uint16_t offsets[16];
  77. unsigned int count = 0;
  78. /* Zero symbol counts */
  79. for (unsigned int i = 0; i < 16; ++i) out->counts[i] = 0;
  80. /* Count symbols */
  81. for (unsigned int i = 0; i < size; ++i) out->counts[lengths[i]]++;
  82. /* Special case... */
  83. out->counts[0] = 0;
  84. /* Figure out offsets */
  85. for (unsigned int i = 0; i < 16; ++i) {
  86. offsets[i] = count;
  87. count += out->counts[i];
  88. }
  89. /* Build symbol ordering */
  90. for (unsigned int i = 0; i < size; ++i) {
  91. if (lengths[i]) out->symbols[offsets[lengths[i]]++] = i;
  92. }
  93. }
  94. /**
  95. * Build the fixed Huffman tables
  96. */
  97. static void build_fixed(void) {
  98. /* From 3.2.6:
  99. * Lit Value Bits Codes
  100. * --------- ---- -----
  101. * 0 - 143 8 00110000 through
  102. * 10111111
  103. * 144 - 255 9 110010000 through
  104. * 111111111
  105. * 256 - 279 7 0000000 through
  106. * 0010111
  107. * 280 - 287 8 11000000 through
  108. * 11000111
  109. */
  110. uint8_t lengths[288];
  111. for (int i = 0; i < 144; ++i) lengths[i] = 8;
  112. for (int i = 144; i < 256; ++i) lengths[i] = 9;
  113. for (int i = 256; i < 280; ++i) lengths[i] = 7;
  114. for (int i = 280; i < 288; ++i) lengths[i] = 8;
  115. build_huffman(lengths, 288, &fixed_lengths);
  116. /* Continued from 3.2.6:
  117. * Distance codes 0-31 are represented by (fixed-length) 5-bit
  118. * codes, with possible additional bits as shown in the table
  119. * shown in Paragraph 3.2.5, above. Note that distance codes 30-
  120. * 31 will never actually occur in the compressed data.
  121. */
  122. for (int i = 0; i < 30; ++i) lengths[i] = 5;
  123. build_huffman(lengths, 30, &fixed_dists);
  124. }
  125. /**
  126. * Decode a symbol from the source using a Huffman table.
  127. */
  128. static int decode(struct inflate_context * ctx, struct huff * huff) {
  129. int count = 0, cur = 0;
  130. for (int i = 1; cur >= 0; i++) {
  131. cur = (cur << 1) | read_bit(ctx); /* Shift */
  132. count += huff->counts[i];
  133. cur -= huff->counts[i];
  134. }
  135. return huff->symbols[count + cur];
  136. }
  137. /**
  138. * Emit one byte to the output, maintaining the ringbuffer.
  139. * The ringbuffer ensures we can always look back 32K bytes
  140. * while keeping output streaming.
  141. */
  142. static void emit(struct inflate_context * ctx, unsigned char byte) {
  143. if (ctx->ring->pointer == 32768) {
  144. ctx->ring->pointer = 0;
  145. }
  146. ctx->ring->data[ctx->ring->pointer] = byte;
  147. ctx->write_output(ctx, byte);
  148. ctx->ring->pointer++;
  149. }
  150. /**
  151. * Look backwards in the output ring buffer.
  152. */
  153. static uint8_t peek(struct inflate_context * ctx, int offset) {
  154. return ctx->ring->data[(ctx->ring->pointer - offset) % 32768];
  155. }
  156. /**
  157. * Decompress a block of Huffman-encoded data.
  158. */
  159. static int inflate(struct inflate_context * ctx, struct huff * huff_len, struct huff * huff_dist) {
  160. /* These are the extra bits for lengths from the tables in section 3.2.5
  161. * Extra Extra Extra
  162. * Code Bits Length(s) Code Bits Lengths Code Bits Length(s)
  163. * ---- ---- ------ ---- ---- ------- ---- ---- -------
  164. * 257 0 3 267 1 15,16 277 4 67-82
  165. * 258 0 4 268 1 17,18 278 4 83-98
  166. * 259 0 5 269 2 19-22 279 4 99-114
  167. * 260 0 6 270 2 23-26 280 4 115-130
  168. * 261 0 7 271 2 27-30 281 5 131-162
  169. * 262 0 8 272 2 31-34 282 5 163-194
  170. * 263 0 9 273 3 35-42 283 5 195-226
  171. * 264 0 10 274 3 43-50 284 5 227-257
  172. * 265 1 11,12 275 3 51-58 285 0 258
  173. * 266 1 13,14 276 3 59-66
  174. */
  175. static const uint16_t lens[] = {
  176. 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51,
  177. 59, 67, 83, 99, 115, 131, 163, 195, 227, 258
  178. };
  179. static const uint16_t lext[] = {
  180. 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
  181. 4, 5, 5, 5, 5, 0
  182. };
  183. /* Extra bits for distances....
  184. * Extra Extra Extra
  185. * Code Bits Dist Code Bits Dist Code Bits Distance
  186. * ---- ---- ---- ---- ---- ------ ---- ---- --------
  187. * 0 0 1 10 4 33-48 20 9 1025-1536
  188. * 1 0 2 11 4 49-64 21 9 1537-2048
  189. * 2 0 3 12 5 65-96 22 10 2049-3072
  190. * 3 0 4 13 5 97-128 23 10 3073-4096
  191. * 4 1 5,6 14 6 129-192 24 11 4097-6144
  192. * 5 1 7,8 15 6 193-256 25 11 6145-8192
  193. * 6 2 9-12 16 7 257-384 26 12 8193-12288
  194. * 7 2 13-16 17 7 385-512 27 12 12289-16384
  195. * 8 3 17-24 18 8 513-768 28 13 16385-24576
  196. * 9 3 25-32 19 8 769-1024 29 13 24577-32768
  197. */
  198. static const uint16_t dists[] = {
  199. 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385,
  200. 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577
  201. };
  202. static const uint16_t dext[] = {
  203. 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  204. 10, 11, 11, 12, 12, 13, 13
  205. };
  206. while (1) {
  207. int symbol = decode(ctx, huff_len);
  208. if (symbol == 256) {
  209. break;
  210. }
  211. if (symbol < 256) {
  212. emit(ctx, symbol);
  213. } else if (symbol == 256) {
  214. /* "The literal/length symbol 256 (end of data), ..." */
  215. break;
  216. } else {
  217. int length, distance, offset;
  218. symbol -= 257;
  219. length = read_bits(ctx, lext[symbol]) + lens[symbol];
  220. distance = decode(ctx, huff_dist);
  221. offset = read_bits(ctx, dext[distance]) + dists[distance];
  222. for (int i = 0; i < length; ++i) {
  223. uint8_t b = peek(ctx, offset);
  224. emit(ctx, b);
  225. }
  226. }
  227. }
  228. return 0;
  229. }
  230. /**
  231. * Decode a dynamic Huffman block.
  232. */
  233. static void decode_huffman(struct inflate_context * ctx) {
  234. /* Ordering of code length codes:
  235. * (HCLEN + 4) x 3 bits: code lengths for the code length
  236. * alphabet given just above, in the order: ...
  237. */
  238. static const uint8_t clens[] = {
  239. 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
  240. };
  241. unsigned int literals, distances, clengths;
  242. uint8_t lengths[320] = {0};
  243. literals = 257 + read_bits(ctx, 5); /* 5 Bits: HLIT ... 257 */
  244. distances = 1 + read_bits(ctx, 5); /* 5 Bits: HDIST ... 1 */
  245. clengths = 4 + read_bits(ctx, 4); /* 4 Bits: HCLEN ... 4 */
  246. /* (HCLEN + 4) x 3 bits... */
  247. for (unsigned int i = 0; i < clengths; ++i) {
  248. lengths[clens[i]] = read_bits(ctx, 3);
  249. }
  250. struct huff codes;
  251. build_huffman(lengths, 19, &codes);
  252. /* Decode symbols:
  253. * HLIT + 257 code lengths for the literal/length alphabet...
  254. * HDIST + 1 code lengths for the distance alphabet...
  255. */
  256. unsigned int count = 0;
  257. while (count < literals + distances) {
  258. int symbol = decode(ctx, &codes);
  259. if (symbol < 16) {
  260. /* 0 - 15: Represent code lengths of 0-15 */
  261. lengths[count++] = symbol;
  262. } else if (symbol < 19) {
  263. int rep = 0, length;
  264. if (symbol == 16) {
  265. /* 16: Copy the previous code length 3-6 times */
  266. rep = lengths[count-1];
  267. length = read_bits(ctx, 2) + 3; /* The next 2 bits indicate repeat length */
  268. } else if (symbol == 17) {
  269. /* Repeat a code length of 0 for 3 - 10 times */
  270. length = read_bits(ctx, 3) + 3; /* 3 bits of length */
  271. } else if (symbol == 18) {
  272. /* Repeat a code length of 0 for 11 - 138 times */
  273. length = read_bits(ctx, 7) + 11; /* 7 bits of length */
  274. }
  275. do {
  276. lengths[count++] = rep;
  277. length--;
  278. } while (length);
  279. } else {
  280. break;
  281. }
  282. }
  283. /* Build tables from lenghts decoded above */
  284. struct huff huff_len;
  285. build_huffman(lengths, literals, &huff_len);
  286. struct huff huff_dist;
  287. build_huffman(lengths + literals, distances, &huff_dist);
  288. inflate(ctx, &huff_len, &huff_dist);
  289. }
  290. /**
  291. * Decode an uncompressed block.
  292. */
  293. static int uncompressed(struct inflate_context * ctx) {
  294. /* Reset byte alignment */
  295. ctx->bit_buffer = 0;
  296. ctx->buffer_size = 0;
  297. /* "The rest of the block consists of the following information:"
  298. * 0 1 2 3 4...
  299. * +---+---+---+---+================================+
  300. * | LEN | NLEN |... LEN bytes of literal data...|
  301. * +---+---+---+---+================================+
  302. */
  303. uint16_t len = read_16le(ctx); /* "the number of data bytes in the block" */
  304. uint16_t nlen = read_16le(ctx); /* "the one's complement of LEN */
  305. /* Sanity check - does the ones-complement length actually match? */
  306. if ((nlen & 0xFFFF) != (~len & 0xFFFF)) {
  307. return 1;
  308. }
  309. /* Emit LEN bytes from the source to the output */
  310. for (int i = 0; i < len; ++i) {
  311. emit(ctx, ctx->get_input(ctx));
  312. }
  313. return 0;
  314. }
  315. static struct huff_ring data = {0, {0}};
  316. /**
  317. * Decompress DEFLATE-compressed data.
  318. */
  319. int deflate_decompress(struct inflate_context * ctx) {
  320. ctx->bit_buffer = 0;
  321. ctx->buffer_size = 0;
  322. build_fixed();
  323. if (!ctx->ring) {
  324. ctx->ring = &data;
  325. }
  326. /* read compressed data */
  327. while (1) {
  328. /* Read bit */
  329. int is_final = read_bit(ctx);
  330. int type = read_bits(ctx, 2);
  331. switch (type) {
  332. case 0x00: /* BTYPE=00 Non-compressed blocks */
  333. uncompressed(ctx);
  334. break;
  335. case 0x01: /* BYTPE=01 Compressed with fixed Huffman codes */
  336. inflate(ctx, &fixed_lengths, &fixed_dists);
  337. break;
  338. case 0x02: /* BTYPE=02 Compression with dynamic Huffman codes */
  339. decode_huffman(ctx);
  340. break;
  341. case 0x03:
  342. return 1;
  343. }
  344. if (is_final) {
  345. break;
  346. }
  347. }
  348. return 0;
  349. }
  350. #define GZIP_FLAG_TEXT (1 << 0)
  351. #define GZIP_FLAG_HCRC (1 << 1)
  352. #define GZIP_FLAG_EXTR (1 << 2)
  353. #define GZIP_FLAG_NAME (1 << 3)
  354. #define GZIP_FLAG_COMM (1 << 4)
  355. static unsigned int read_32le(struct inflate_context * ctx) {
  356. unsigned int a, b, c, d;
  357. a = ctx->get_input(ctx);
  358. b = ctx->get_input(ctx);
  359. c = ctx->get_input(ctx);
  360. d = ctx->get_input(ctx);
  361. return (d << 24) | (c << 16) | (b << 8) | (a << 0);
  362. }
  363. int gzip_decompress(struct inflate_context * ctx) {
  364. /* Read gzip headers */
  365. if (ctx->get_input(ctx) != 0x1F) return 1;
  366. if (ctx->get_input(ctx) != 0x8B) return 1;
  367. unsigned int flags = ctx->get_input(ctx);
  368. /* Read mtime */
  369. unsigned int mtime = read_32le(ctx);
  370. (void)mtime;
  371. /* Read extra flags */
  372. unsigned int xflags = ctx->get_input(ctx);
  373. (void)xflags;
  374. /* Read and discord OS flag */
  375. unsigned int os = ctx->get_input(ctx);
  376. (void)os;
  377. /* Extra bytes */
  378. if (flags & GZIP_FLAG_EXTR) {
  379. unsigned short size = read_16le(ctx);
  380. for (unsigned int i = 0; i < size; ++i) ctx->get_input(ctx);
  381. }
  382. if (flags & GZIP_FLAG_NAME) {
  383. unsigned int c;
  384. while ((c = ctx->get_input(ctx)) != 0);
  385. }
  386. if (flags & GZIP_FLAG_COMM) {
  387. unsigned int c;
  388. while ((c = ctx->get_input(ctx)) != 0);
  389. }
  390. unsigned int crc16 = 0;
  391. if (flags & GZIP_FLAG_HCRC) {
  392. crc16 = read_16le(ctx);
  393. }
  394. (void)crc16;
  395. deflate_decompress(ctx);
  396. /* Read CRC and decompressed size from end of input */
  397. unsigned int crc32 = read_32le(ctx);
  398. unsigned int dsize = read_32le(ctx);
  399. (void)crc32;
  400. (void)dsize;
  401. return 0;
  402. }