| /* |
| * Branch/Call/Jump (BCJ) filter decoders |
| * |
| * Authors: Lasse Collin <[email protected]> |
| * Igor Pavlov <https://7-zip.org/> |
| * |
| * This file has been put into the public domain. |
| * You can do whatever you want with this file. |
| */ |
| |
| #include "xz_private.h" |
| |
| /* |
| * The rest of the file is inside this ifdef. It makes things a little more |
| * convenient when building without support for any BCJ filters. |
| */ |
| #ifdef XZ_DEC_BCJ |
| |
| struct xz_dec_bcj { |
| /* Type of the BCJ filter being used */ |
| enum { |
| BCJ_X86 = 4, /* x86 or x86-64 */ |
| BCJ_POWERPC = 5, /* Big endian only */ |
| BCJ_IA64 = 6, /* Big or little endian */ |
| BCJ_ARM = 7, /* Little endian only */ |
| BCJ_ARMTHUMB = 8, /* Little endian only */ |
| BCJ_SPARC = 9, /* Big or little endian */ |
| BCJ_ARM64 = 10 /* AArch64 */ |
| } type; |
| |
| /* |
| * Return value of the next filter in the chain. We need to preserve |
| * this information across calls, because we must not call the next |
| * filter anymore once it has returned XZ_STREAM_END. |
| */ |
| enum xz_ret ret; |
| |
| /* True if we are operating in single-call mode. */ |
| bool single_call; |
| |
| /* |
| * Absolute position relative to the beginning of the uncompressed |
| * data (in a single .xz Block). We care only about the lowest 32 |
| * bits so this doesn't need to be uint64_t even with big files. |
| */ |
| uint32_t pos; |
| |
| /* x86 filter state */ |
| uint32_t x86_prev_mask; |
| |
| /* Temporary space to hold the variables from struct xz_buf */ |
| uint8_t *out; |
| size_t out_pos; |
| size_t out_size; |
| |
| struct { |
| /* Amount of already filtered data in the beginning of buf */ |
| size_t filtered; |
| |
| /* Total amount of data currently stored in buf */ |
| size_t size; |
| |
| /* |
| * Buffer to hold a mix of filtered and unfiltered data. This |
| * needs to be big enough to hold Alignment + 2 * Look-ahead: |
| * |
| * Type Alignment Look-ahead |
| * x86 1 4 |
| * PowerPC 4 0 |
| * IA-64 16 0 |
| * ARM 4 0 |
| * ARM-Thumb 2 2 |
| * SPARC 4 0 |
| */ |
| uint8_t buf[16]; |
| } temp; |
| }; |
| |
| #ifdef XZ_DEC_X86 |
/*
 * Check the most significant byte of an address operand in an x86
 * instruction. Returns 1 when the byte is 0x00 or 0xFF and 0 otherwise.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	switch (b) {
	case 0x00:
	case 0xFF:
		return 1;
	default:
		return 0;
	}
}
| |
| static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size) |
| { |
| static const bool mask_to_allowed_status[8] |
| = { true, true, true, false, true, false, false, false }; |
| |
| static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 }; |
| |
| size_t i; |
| size_t prev_pos = (size_t)-1; |
| uint32_t prev_mask = s->x86_prev_mask; |
| uint32_t src; |
| uint32_t dest; |
| uint32_t j; |
| uint8_t b; |
| |
| if (size <= 4) |
| return 0; |
| |
| size -= 4; |
| for (i = 0; i < size; ++i) { |
| if ((buf[i] & 0xFE) != 0xE8) |
| continue; |
| |
| prev_pos = i - prev_pos; |
| if (prev_pos > 3) { |
| prev_mask = 0; |
| } else { |
| prev_mask = (prev_mask << (prev_pos - 1)) & 7; |
| if (prev_mask != 0) { |
| b = buf[i + 4 - mask_to_bit_num[prev_mask]]; |
| if (!mask_to_allowed_status[prev_mask] |
| || bcj_x86_test_msbyte(b)) { |
| prev_pos = i; |
| prev_mask = (prev_mask << 1) | 1; |
| continue; |
| } |
| } |
| } |
| |
| prev_pos = i; |
| |
| if (bcj_x86_test_msbyte(buf[i + 4])) { |
| src = get_unaligned_le32(buf + i + 1); |
| while (true) { |
| dest = src - (s->pos + (uint32_t)i + 5); |
| if (prev_mask == 0) |
| break; |
| |
| j = mask_to_bit_num[prev_mask] * 8; |
| b = (uint8_t)(dest >> (24 - j)); |
| if (!bcj_x86_test_msbyte(b)) |
| break; |
| |
| src = dest ^ (((uint32_t)1 << (32 - j)) - 1); |
| } |
| |
| dest &= 0x01FFFFFF; |
| dest |= (uint32_t)0 - (dest & 0x01000000); |
| put_unaligned_le32(dest, buf + i + 1); |
| i += 4; |
| } else { |
| prev_mask = (prev_mask << 1) | 1; |
| } |
| } |
| |
| prev_pos = i - prev_pos; |
| s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1); |
| return i; |
| } |
| #endif |
| |
| #ifdef XZ_DEC_POWERPC |
| static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) |
| { |
| size_t i; |
| uint32_t instr; |
| |
| for (i = 0; i + 4 <= size; i += 4) { |
| instr = get_unaligned_be32(buf + i); |
| if ((instr & 0xFC000003) == 0x48000001) { |
| instr &= 0x03FFFFFC; |
| instr -= s->pos + (uint32_t)i; |
| instr &= 0x03FFFFFC; |
| instr |= 0x48000001; |
| put_unaligned_be32(instr, buf + i); |
| } |
| } |
| |
| return i; |
| } |
| #endif |
| |
| #ifdef XZ_DEC_IA64 |
| static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) |
| { |
| static const uint8_t branch_table[32] = { |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 0, 0, 0, 0, 0, 0, 0, 0, |
| 4, 4, 6, 6, 0, 0, 7, 7, |
| 4, 4, 0, 0, 4, 4, 0, 0 |
| }; |
| |
| /* |
| * The local variables take a little bit stack space, but it's less |
| * than what LZMA2 decoder takes, so it doesn't make sense to reduce |
| * stack usage here without doing that for the LZMA2 decoder too. |
| */ |
| |
| /* Loop counters */ |
| size_t i; |
| size_t j; |
| |
| /* Instruction slot (0, 1, or 2) in the 128-bit instruction word */ |
| uint32_t slot; |
| |
| /* Bitwise offset of the instruction indicated by slot */ |
| uint32_t bit_pos; |
| |
| /* bit_pos split into byte and bit parts */ |
| uint32_t byte_pos; |
| uint32_t bit_res; |
| |
| /* Address part of an instruction */ |
| uint32_t addr; |
| |
| /* Mask used to detect which instructions to convert */ |
| uint32_t mask; |
| |
| /* 41-bit instruction stored somewhere in the lowest 48 bits */ |
| uint64_t instr; |
| |
| /* Instruction normalized with bit_res for easier manipulation */ |
| uint64_t norm; |
| |
| for (i = 0; i + 16 <= size; i += 16) { |
| mask = branch_table[buf[i] & 0x1F]; |
| for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { |
| if (((mask >> slot) & 1) == 0) |
| continue; |
| |
| byte_pos = bit_pos >> 3; |
| bit_res = bit_pos & 7; |
| instr = 0; |
| for (j = 0; j < 6; ++j) |
| instr |= (uint64_t)(buf[i + j + byte_pos]) |
| << (8 * j); |
| |
| norm = instr >> bit_res; |
| |
| if (((norm >> 37) & 0x0F) == 0x05 |
| && ((norm >> 9) & 0x07) == 0) { |
| addr = (norm >> 13) & 0x0FFFFF; |
| addr |= ((uint32_t)(norm >> 36) & 1) << 20; |
| addr <<= 4; |
| addr -= s->pos + (uint32_t)i; |
| addr >>= 4; |
| |
| norm &= ~((uint64_t)0x8FFFFF << 13); |
| norm |= (uint64_t)(addr & 0x0FFFFF) << 13; |
| norm |= (uint64_t)(addr & 0x100000) |
| << (36 - 20); |
| |
| instr &= (1 << bit_res) - 1; |
| instr |= norm << bit_res; |
| |
| for (j = 0; j < 6; j++) |
| buf[i + j + byte_pos] |
| = (uint8_t)(instr >> (8 * j)); |
| } |
| } |
| } |
| |
| return i; |
| } |
| #endif |
| |
| #ifdef XZ_DEC_ARM |
| static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) |
| { |
| size_t i; |
| uint32_t addr; |
| |
| for (i = 0; i + 4 <= size; i += 4) { |
| if (buf[i + 3] == 0xEB) { |
| addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) |
| | ((uint32_t)buf[i + 2] << 16); |
| addr <<= 2; |
| addr -= s->pos + (uint32_t)i + 8; |
| addr >>= 2; |
| buf[i] = (uint8_t)addr; |
| buf[i + 1] = (uint8_t)(addr >> 8); |
| buf[i + 2] = (uint8_t)(addr >> 16); |
| } |
| } |
| |
| return i; |
| } |
| #endif |
| |
| #ifdef XZ_DEC_ARMTHUMB |
| static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) |
| { |
| size_t i; |
| uint32_t addr; |
| |
| for (i = 0; i + 4 <= size; i += 2) { |
| if ((buf[i + 1] & 0xF8) == 0xF0 |
| && (buf[i + 3] & 0xF8) == 0xF8) { |
| addr = (((uint32_t)buf[i + 1] & 0x07) << 19) |
| | ((uint32_t)buf[i] << 11) |
| | (((uint32_t)buf[i + 3] & 0x07) << 8) |
| | (uint32_t)buf[i + 2]; |
| addr <<= 1; |
| addr -= s->pos + (uint32_t)i + 4; |
| addr >>= 1; |
| buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07)); |
| buf[i] = (uint8_t)(addr >> 11); |
| buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07)); |
| buf[i + 2] = (uint8_t)addr; |
| i += 2; |
| } |
| } |
| |
| return i; |
| } |
| #endif |
| |
| #ifdef XZ_DEC_SPARC |
| static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) |
| { |
| size_t i; |
| uint32_t instr; |
| |
| for (i = 0; i + 4 <= size; i += 4) { |
| instr = get_unaligned_be32(buf + i); |
| if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { |
| instr <<= 2; |
| instr -= s->pos + (uint32_t)i; |
| instr >>= 2; |
| instr = ((uint32_t)0x40000000 - (instr & 0x400000)) |
| | 0x40000000 | (instr & 0x3FFFFF); |
| put_unaligned_be32(instr, buf + i); |
| } |
| } |
| |
| return i; |
| } |
| #endif |
| |
| #ifdef XZ_DEC_ARM64 |
| static size_t bcj_arm64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) |
| { |
| size_t i; |
| uint32_t instr; |
| uint32_t addr; |
| |
| for (i = 0; i + 4 <= size; i += 4) { |
| instr = get_unaligned_le32(buf + i); |
| |
| if ((instr >> 26) == 0x25) { |
| /* BL instruction */ |
| addr = instr - ((s->pos + (uint32_t)i) >> 2); |
| instr = 0x94000000 | (addr & 0x03FFFFFF); |
| put_unaligned_le32(instr, buf + i); |
| |
| } else if ((instr & 0x9F000000) == 0x90000000) { |
| /* ADRP instruction */ |
| addr = ((instr >> 29) & 3) | ((instr >> 3) & 0x1FFFFC); |
| |
| /* Only convert values in the range +/-512 MiB. */ |
| if ((addr + 0x020000) & 0x1C0000) |
| continue; |
| |
| addr -= (s->pos + (uint32_t)i) >> 12; |
| |
| instr &= 0x9000001F; |
| instr |= (addr & 3) << 29; |
| instr |= (addr & 0x03FFFC) << 3; |
| instr |= (0U - (addr & 0x020000)) & 0xE00000; |
| |
| put_unaligned_le32(instr, buf + i); |
| } |
| } |
| |
| return i; |
| } |
| #endif |
| |
| /* |
| * Apply the selected BCJ filter. Update *pos and s->pos to match the amount |
| * of data that got filtered. |
| * |
| * NOTE: This is implemented as a switch statement to avoid using function |
| * pointers, which could be problematic in the kernel boot code, which must |
| * avoid pointers to static data (at least on x86). |
| */ |
| static void bcj_apply(struct xz_dec_bcj *s, |
| uint8_t *buf, size_t *pos, size_t size) |
| { |
| size_t filtered; |
| |
| buf += *pos; |
| size -= *pos; |
| |
| switch (s->type) { |
| #ifdef XZ_DEC_X86 |
| case BCJ_X86: |
| filtered = bcj_x86(s, buf, size); |
| break; |
| #endif |
| #ifdef XZ_DEC_POWERPC |
| case BCJ_POWERPC: |
| filtered = bcj_powerpc(s, buf, size); |
| break; |
| #endif |
| #ifdef XZ_DEC_IA64 |
| case BCJ_IA64: |
| filtered = bcj_ia64(s, buf, size); |
| break; |
| #endif |
| #ifdef XZ_DEC_ARM |
| case BCJ_ARM: |
| filtered = bcj_arm(s, buf, size); |
| break; |
| #endif |
| #ifdef XZ_DEC_ARMTHUMB |
| case BCJ_ARMTHUMB: |
| filtered = bcj_armthumb(s, buf, size); |
| break; |
| #endif |
| #ifdef XZ_DEC_SPARC |
| case BCJ_SPARC: |
| filtered = bcj_sparc(s, buf, size); |
| break; |
| #endif |
| #ifdef XZ_DEC_ARM64 |
| case BCJ_ARM64: |
| filtered = bcj_arm64(s, buf, size); |
| break; |
| #endif |
| default: |
| /* Never reached but silence compiler warnings. */ |
| filtered = 0; |
| break; |
| } |
| |
| *pos += filtered; |
| s->pos += filtered; |
| } |
| |
| /* |
| * Flush pending filtered data from temp to the output buffer. |
| * Move the remaining mixture of possibly filtered and unfiltered |
| * data to the beginning of temp. |
| */ |
| static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) |
| { |
| size_t copy_size; |
| |
| copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos); |
| memcpy(b->out + b->out_pos, s->temp.buf, copy_size); |
| b->out_pos += copy_size; |
| |
| s->temp.filtered -= copy_size; |
| s->temp.size -= copy_size; |
| memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size); |
| } |
| |
| /* |
| * The BCJ filter functions are primitive in sense that they process the |
| * data in chunks of 1-16 bytes. To hide this issue, this function does |
| * some buffering. |
| */ |
| XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, |
| struct xz_dec_lzma2 *lzma2, |
| struct xz_buf *b) |
| { |
| size_t out_start; |
| |
| /* |
| * Flush pending already filtered data to the output buffer. Return |
| * immediately if we couldn't flush everything, or if the next |
| * filter in the chain had already returned XZ_STREAM_END. |
| */ |
| if (s->temp.filtered > 0) { |
| bcj_flush(s, b); |
| if (s->temp.filtered > 0) |
| return XZ_OK; |
| |
| if (s->ret == XZ_STREAM_END) |
| return XZ_STREAM_END; |
| } |
| |
| /* |
| * If we have more output space than what is currently pending in |
| * temp, copy the unfiltered data from temp to the output buffer |
| * and try to fill the output buffer by decoding more data from the |
| * next filter in the chain. Apply the BCJ filter on the new data |
| * in the output buffer. If everything cannot be filtered, copy it |
| * to temp and rewind the output buffer position accordingly. |
| * |
| * This needs to be always run when temp.size == 0 to handle a special |
| * case where the output buffer is full and the next filter has no |
| * more output coming but hasn't returned XZ_STREAM_END yet. |
| */ |
| if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) { |
| out_start = b->out_pos; |
| memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size); |
| b->out_pos += s->temp.size; |
| |
| s->ret = xz_dec_lzma2_run(lzma2, b); |
| if (s->ret != XZ_STREAM_END |
| && (s->ret != XZ_OK || s->single_call)) |
| return s->ret; |
| |
| bcj_apply(s, b->out, &out_start, b->out_pos); |
| |
| /* |
| * As an exception, if the next filter returned XZ_STREAM_END, |
| * we can do that too, since the last few bytes that remain |
| * unfiltered are meant to remain unfiltered. |
| */ |
| if (s->ret == XZ_STREAM_END) |
| return XZ_STREAM_END; |
| |
| s->temp.size = b->out_pos - out_start; |
| b->out_pos -= s->temp.size; |
| memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size); |
| |
| /* |
| * If there wasn't enough input to the next filter to fill |
| * the output buffer with unfiltered data, there's no point |
| * to try decoding more data to temp. |
| */ |
| if (b->out_pos + s->temp.size < b->out_size) |
| return XZ_OK; |
| } |
| |
| /* |
| * We have unfiltered data in temp. If the output buffer isn't full |
| * yet, try to fill the temp buffer by decoding more data from the |
| * next filter. Apply the BCJ filter on temp. Then we hopefully can |
| * fill the actual output buffer by copying filtered data from temp. |
| * A mix of filtered and unfiltered data may be left in temp; it will |
| * be taken care on the next call to this function. |
| */ |
| if (b->out_pos < b->out_size) { |
| /* Make b->out{,_pos,_size} temporarily point to s->temp. */ |
| s->out = b->out; |
| s->out_pos = b->out_pos; |
| s->out_size = b->out_size; |
| b->out = s->temp.buf; |
| b->out_pos = s->temp.size; |
| b->out_size = sizeof(s->temp.buf); |
| |
| s->ret = xz_dec_lzma2_run(lzma2, b); |
| |
| s->temp.size = b->out_pos; |
| b->out = s->out; |
| b->out_pos = s->out_pos; |
| b->out_size = s->out_size; |
| |
| if (s->ret != XZ_OK && s->ret != XZ_STREAM_END) |
| return s->ret; |
| |
| bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size); |
| |
| /* |
| * If the next filter returned XZ_STREAM_END, we mark that |
| * everything is filtered, since the last unfiltered bytes |
| * of the stream are meant to be left as is. |
| */ |
| if (s->ret == XZ_STREAM_END) |
| s->temp.filtered = s->temp.size; |
| |
| bcj_flush(s, b); |
| if (s->temp.filtered > 0) |
| return XZ_OK; |
| } |
| |
| return s->ret; |
| } |
| |
| XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) |
| { |
| struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL); |
| if (s != NULL) |
| s->single_call = single_call; |
| |
| return s; |
| } |
| |
| XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) |
| { |
| switch (id) { |
| #ifdef XZ_DEC_X86 |
| case BCJ_X86: |
| #endif |
| #ifdef XZ_DEC_POWERPC |
| case BCJ_POWERPC: |
| #endif |
| #ifdef XZ_DEC_IA64 |
| case BCJ_IA64: |
| #endif |
| #ifdef XZ_DEC_ARM |
| case BCJ_ARM: |
| #endif |
| #ifdef XZ_DEC_ARMTHUMB |
| case BCJ_ARMTHUMB: |
| #endif |
| #ifdef XZ_DEC_SPARC |
| case BCJ_SPARC: |
| #endif |
| #ifdef XZ_DEC_ARM64 |
| case BCJ_ARM64: |
| #endif |
| break; |
| |
| default: |
| /* Unsupported Filter ID */ |
| return XZ_OPTIONS_ERROR; |
| } |
| |
| s->type = id; |
| s->ret = XZ_OK; |
| s->pos = 0; |
| s->x86_prev_mask = 0; |
| s->temp.filtered = 0; |
| s->temp.size = 0; |
| |
| return XZ_OK; |
| } |
| |
| #endif |