| #!/usr/bin/env bash |
| # |
| # american fuzzy lop++ - corpus minimization tool |
| # --------------------------------------------- |
| # |
| # Originally written by Michal Zalewski |
| # |
| # Copyright 2014, 2015 Google Inc. All rights reserved. |
| # |
| # Copyright 2019-2024 AFLplusplus |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at: |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # This tool tries to find the smallest subset of files in the input directory |
| # that still trigger the full range of instrumentation data points seen in |
| # the starting corpus. This has two uses: |
| # |
| # - Screening large corpora of input files before using them as a seed for |
| # afl-fuzz. The tool will remove functionally redundant files and likely |
| # leave you with a much smaller set. |
| # |
| # (In this case, you probably also want to consider running afl-tmin on |
| # the individual files later on to reduce their size.) |
| # |
| # - Minimizing the corpus generated organically by afl-fuzz, perhaps when |
| # planning to feed it to more resource-intensive tools. The tool achieves |
| # this by removing all entries that used to trigger unique behaviors in the |
| # past, but have been made obsolete by later finds. |
| # |
| # Note that the tool doesn't modify the files themselves. For that, you want |
| # afl-tmin. |
| # |
| # This script must use bash because other shells may have hardcoded limits on |
| # array sizes. |
| # |
| |
| echo "corpus minimization tool for afl-fuzz" |
| echo |
| |
| ######### |
| # SETUP # |
| ######### |
| |
| # Process command-line options... |
| |
| MEM_LIMIT=none |
| TIMEOUT=5000 |
| |
| unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN F_ARG \ |
| AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE T_ARG |
| |
| export AFL_QUIET=1 |
| |
| while getopts "+i:o:f:m:t:T:eOQUAChXY" opt; do |
| |
| case "$opt" in |
| |
| "h") |
| ;; |
| |
| "i") |
| IN_DIR="$OPTARG" |
| ;; |
| |
| "o") |
| OUT_DIR="$OPTARG" |
| ;; |
| "f") |
| STDIN_FILE="$OPTARG" |
| F_ARG=1 |
| ;; |
| "m") |
| MEM_LIMIT="$OPTARG" |
| MEM_LIMIT_GIVEN=1 |
| ;; |
| "t") |
| TIMEOUT="$OPTARG" |
| ;; |
| "e") |
| EXTRA_PAR="$EXTRA_PAR -e" |
| ;; |
| "A") |
| export AFL_CMIN_ALLOW_ANY=1 |
| ;; |
| "C") |
| export AFL_CMIN_CRASHES_ONLY=1 |
| ;; |
| "O") |
| EXTRA_PAR="$EXTRA_PAR -O" |
| FRIDA_MODE=1 |
| ;; |
| "Q") |
| EXTRA_PAR="$EXTRA_PAR -Q" |
| QEMU_MODE=1 |
| ;; |
| "Y") |
| EXTRA_PAR="$EXTRA_PAR -X" |
| NYX_MODE=1 |
| ;; |
| "X") |
| EXTRA_PAR="$EXTRA_PAR -X" |
| NYX_MODE=1 |
| ;; |
| "U") |
| EXTRA_PAR="$EXTRA_PAR -U" |
| UNICORN_MODE=1 |
| ;; |
| "T") |
| T_ARG="$OPTARG" |
| ;; |
| "?") |
| exit 1 |
| ;; |
| |
| esac |
| |
| done |
| |
| shift $((OPTIND-1)) |
| |
| TARGET_BIN="$1" |
| |
| if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then |
| |
| cat 1>&2 <<_EOF_ |
| Usage: $0 [ options ] -- /path/to/target_app [ ... ] |
| |
| Required parameters: |
| |
| -i dir - input directory with the starting corpus |
| -o dir - output directory for minimized files |
| |
| Execution control settings: |
| |
| -T tasks - how many parallel processes to create (default=1, "all"=nproc) |
| -f file - location read by the fuzzed program (default: stdin) |
| -m megs - memory limit for child process (default=$MEM_LIMIT MB) |
| -t msec - run time limit for child process (default: 5000ms) |
| -O - use binary-only instrumentation (FRIDA mode) |
| -Q - use binary-only instrumentation (QEMU mode) |
| -U - use unicorn-based instrumentation (Unicorn mode) |
| -X - use Nyx mode |
| |
| Minimization settings: |
| |
| -A - allow crashing and timeout inputs |
| -C - keep crashing inputs, reject everything else |
| -e - solve for edge coverage only, ignore hit counts |
| |
| For additional tips, please consult README.md. |
| |
| Environment variables used: |
| AFL_KEEP_TRACES: leave the temporary <out_dir>\.traces directory |
| AFL_NO_FORKSRV: run target via execve instead of using the forkserver |
| AFL_PATH: last resort location to find the afl-showmap binary |
| AFL_SKIP_BIN_CHECK: skip check for target binary |
| AFL_CUSTOM_MUTATOR_LIBRARY: custom mutator library (post_process and send) |
| AFL_PYTHON_MODULE: custom mutator library (post_process and send) |
| _EOF_ |
| exit 1 |
| fi |
| |
| # Do a sanity check to discourage the use of /tmp, since we can't really |
| # handle this safely from a shell script. |
| |
| if [ "$AFL_ALLOW_TMP" = "" ]; then |
| |
| echo "$IN_DIR" | grep -qE '^(/var)?/tmp/' |
| T1="$?" |
| |
| echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/' |
| T2="$?" |
| |
| echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/' |
| T3="$?" |
| |
| echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/' |
| T4="$?" |
| |
| echo "$PWD" | grep -qE '^(/var)?/tmp/' |
| T5="$?" |
| |
| if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then |
| echo "[-] Warning: do not use this script in /tmp or /var/tmp for security reasons." 1>&2 |
| fi |
| |
| fi |
| |
| # If @@ is specified, but there's no -f, let's come up with a temporary input |
| # file name. |
| |
| TRACE_DIR="$OUT_DIR/.traces" |
| |
| if [ "$STDIN_FILE" = "" ]; then |
| |
| if echo "$*" | grep -qF '@@'; then |
| STDIN_FILE="$TRACE_DIR/.cur_input" |
| fi |
| |
| fi |
| |
| # Check for obvious errors. |
| |
| if [ ! "$T_ARG" = "" -a -n "$F_ARG" -a ! "$NYX_MODE" == 1 ]; then |
| echo "[-] Error: -T and -f can not be used together." 1>&2 |
| exit 1 |
| fi |
| |
| if [ ! "$MEM_LIMIT" = "none" ]; then |
| |
| if [ "$MEM_LIMIT" -lt "5" ]; then |
| echo "[-] Error: dangerously low memory limit." 1>&2 |
| exit 1 |
| fi |
| |
| fi |
| |
| if [ ! "$TIMEOUT" = "none" ]; then |
| |
| if [ "$TIMEOUT" -lt "10" ]; then |
| echo "[-] Error: dangerously low timeout." 1>&2 |
| exit 1 |
| fi |
| |
| fi |
| |
| if [ "$NYX_MODE" = "" ]; then |
| if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then |
| |
| TNEW="`which "$TARGET_BIN" 2>/dev/null`" |
| |
| if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then |
| echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2 |
| exit 1 |
| fi |
| |
| TARGET_BIN="$TNEW" |
| |
| fi |
| |
| fi |
| |
| grep -aq AFL_DUMP_MAP_SIZE "$TARGET_BIN" && { |
| echo "[!] Trying to obtain the map size of the target ..." |
| MAPSIZE=`AFL_DUMP_MAP_SIZE=1 "./$TARGET_BIN" 2>/dev/null` |
| test -n "$MAPSIZE" && { |
| export AFL_MAP_SIZE=$MAPSIZE |
| echo "[+] Setting AFL_MAP_SIZE=$MAPSIZE" |
| } |
| } |
| |
| if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$FRIDA_MODE" = "" -a "$UNICORN_MODE" = "" -a "$NYX_MODE" = "" ]; then |
| |
| if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then |
| echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2 |
| exit 1 |
| fi |
| |
| fi |
| |
| if [ ! -d "$IN_DIR" ]; then |
| echo "[-] Error: directory '$IN_DIR' not found." 1>&2 |
| exit 1 |
| fi |
| |
| test -d "$IN_DIR/default" && IN_DIR="$IN_DIR/default" |
| test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue" |
| |
| find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null |
| rm -rf "$TRACE_DIR" 2>/dev/null |
| |
| rmdir "$OUT_DIR" 2>/dev/null |
| |
| if [ -d "$OUT_DIR" ]; then |
| echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2 |
| exit 1 |
| fi |
| |
| mkdir -m 700 -p "$TRACE_DIR" || exit 1 |
| |
| if [ ! "$STDIN_FILE" = "" ]; then |
| rm -f "$STDIN_FILE" || exit 1 |
| touch "$STDIN_FILE" || exit 1 |
| fi |
| |
| SHOWMAP=`command -v afl-showmap 2>/dev/null` |
| |
| if [ -z "$SHOWMAP" ]; then |
| TMP="${0%/afl-cmin.bash}/afl-showmap" |
| if [ -x "$TMP" ]; then |
| SHOWMAP=$TMP |
| fi |
| fi |
| |
| if [ -z "$SHOWMAP" -a -x "./afl-showmap" ]; then |
| SHOWMAP="./afl-showmap" |
| else |
| if [ -n "$AFL_PATH" ]; then |
| SHOWMAP="$AFL_PATH/afl-showmap" |
| fi |
| fi |
| |
| if [ ! -x "$SHOWMAP" ]; then |
| echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2 |
| rm -rf "$TRACE_DIR" |
| exit 1 |
| fi |
| |
| THREADS= |
| if [ ! "$T_ARG" = "" ]; then |
| if [ "$T_ARG" = "all" ]; then |
| THREADS=$(nproc) |
| else |
| if [ "$T_ARG" -gt 1 -a "$T_ARG" -le "$(nproc)" ]; then |
| THREADS=$T_ARG |
| else |
| echo "[-] Error: -T parameter must between 2 and $(nproc) or \"all\"." 1>&2 |
| fi |
| fi |
| else |
| if [ -z "$F_ARG" ]; then |
| echo "[*] Are you aware of the '-T all' parallelize option that massively improves the speed?" |
| fi |
| fi |
| |
| IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`)) |
| |
| if [ "$IN_COUNT" = "0" ]; then |
| echo "[-] Hmm, no inputs in the target directory. Nothing to be done." |
| rm -rf "$TRACE_DIR" |
| exit 1 |
| fi |
| |
| echo "[*] Are you aware that afl-cmin is faster than this afl-cmin.bash script?" |
| echo "[+] Found $IN_COUNT files for minimizing." |
| |
| if [ -n "$THREADS" ]; then |
| if [ "$IN_COUNT" -lt "$THREADS" ]; then |
| THREADS=$IN_COUNT |
| echo "[!] WARNING: less inputs than threads, reducing threads to $THREADS and likely the overhead of threading makes things slower..." |
| fi |
| fi |
| |
| FIRST_FILE=`ls "$IN_DIR" | head -1` |
| |
| # Make sure that we're not dealing with a directory. |
| |
| if [ -d "$IN_DIR/$FIRST_FILE" ]; then |
| echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2 |
| rm -rf "$TRACE_DIR" |
| exit 1 |
| fi |
| |
| # Check for the more efficient way to copy files... |
| |
| if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then |
| CP_TOOL=ln |
| else |
| CP_TOOL=cp |
| fi |
| |
| # Make sure that we can actually get anything out of afl-showmap before we |
| # waste too much time. |
| |
| echo "[*] Testing the target binary..." |
| |
| if [ "$STDIN_FILE" = "" ]; then |
| |
| AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE" |
| |
| else |
| |
| cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE" |
| AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null |
| |
| fi |
| |
| FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`)) |
| |
| if [ "$FIRST_COUNT" -gt "0" ]; then |
| |
| echo "[+] OK, $FIRST_COUNT tuples recorded." |
| |
| else |
| |
| echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2 |
| test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR" |
| exit 1 |
| |
| fi |
| |
| TMPFILE=$OUT_DIR/.list.$$ |
| if [ ! "$THREADS" = "" ]; then |
| ls -- "$IN_DIR" > $TMPFILE 2>/dev/null |
| IN_COUNT=$(cat $TMPFILE | wc -l) |
| SPLIT=$(($IN_COUNT / $THREADS)) |
| if [ "$(($IN_COUNT % $THREADS))" -gt 0 ]; then |
| SPLIT=$(($SPLIT + 1)) |
| fi |
| echo "[+] Splitting workload into $THREADS tasks with $SPLIT items on average each." |
| split -l $SPLIT $TMPFILE $TMPFILE. |
| fi |
| |
| # Let's roll! |
| |
| ############################# |
| # STEP 1: COLLECTING TRACES # |
| ############################# |
| |
| echo "[*] Obtaining traces for input files in '$IN_DIR'..." |
| |
| if [ "$THREADS" = "" ]; then |
| ( |
| |
| CUR=0 |
| |
| if [ "$STDIN_FILE" = "" ]; then |
| |
| ls "$IN_DIR" | while read -r fn; do |
| |
| if [ -s "$IN_DIR/$fn" ]; then |
| |
| CUR=$((CUR+1)) |
| printf "\\r Processing file $CUR/$IN_COUNT... " |
| |
| "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn" |
| |
| fi |
| |
| done |
| |
| else |
| |
| ls "$IN_DIR" | while read -r fn; do |
| |
| if [ -s "$IN_DIR/$fn" ]; then |
| |
| CUR=$((CUR+1)) |
| printf "\\r Processing file $CUR/$IN_COUNT... " |
| |
| cp "$IN_DIR/$fn" "$STDIN_FILE" |
| "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null |
| |
| fi |
| |
| done |
| |
| fi |
| |
| echo |
| |
| ) |
| |
| else |
| |
| PIDS= |
| CNT=0 |
| for inputs in $(ls ${TMPFILE}.*); do |
| |
| ( |
| |
| if [ "$STDIN_FILE" = "" ]; then |
| |
| cat $inputs | while read -r fn; do |
| |
| if [ -s "$IN_DIR/$fn" ]; then |
| |
| "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn" |
| |
| fi |
| |
| done |
| |
| else |
| |
| if [ -s "$IN_DIR/$fn" ]; then |
| STDIN_FILE="$inputs.$$" |
| cat $inputs | while read -r fn; do |
| |
| cp "$IN_DIR/$fn" "$STDIN_FILE" |
| "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null |
| |
| done |
| |
| fi |
| |
| fi |
| |
| ) & |
| |
| PIDS="$PIDS $!" |
| done |
| |
| echo "[+] Waiting for running tasks IDs:$PIDS" |
| wait |
| echo "[+] all $THREADS running tasks completed." |
| rm -f ${TMPFILE}* |
| |
| #echo trace dir files: $(ls $TRACE_DIR/*|wc -l) |
| |
| fi |
| |
| |
| ########################## |
| # STEP 2: SORTING TUPLES # |
| ########################## |
| |
| # With this out of the way, we sort all tuples by popularity across all |
| # datasets. The reasoning here is that we won't be able to avoid the files |
| # that trigger unique tuples anyway, so we will want to start with them and |
| # see what's left. |
| |
| echo "[*] Sorting trace sets (this may take a while)..." |
| |
| ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \ |
| sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq" |
| |
| TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`)) |
| |
| echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files." |
| |
| ##################################### |
| # STEP 3: SELECTING CANDIDATE FILES # |
| ##################################### |
| |
| # The next step is to find the best candidate for each tuple. The "best" |
| # part is understood simply as the smallest input that includes a particular |
| # tuple in its trace. Empirical evidence suggests that this produces smaller |
| # datasets than more involved algorithms that could be still pulled off in |
| # a shell script. |
| |
| echo "[*] Finding best candidates for each tuple..." |
| |
| CUR=0 |
| |
| ls -rS "$IN_DIR" | while read -r fn; do |
| |
| CUR=$((CUR+1)) |
| printf "\\r Processing file $CUR/$IN_COUNT... " |
| |
| sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list" |
| |
| test -s "$TRACE_DIR/$fn" || echo Warning: $fn is ignored because of crashing the target |
| |
| done |
| |
| echo |
| |
| ############################## |
| # STEP 4: LOADING CANDIDATES # |
| ############################## |
| |
| # At this point, we have a file of tuple-file pairs, sorted by file size |
| # in ascending order (as a consequence of ls -rS). By doing sort keyed |
| # only by tuple (-k 1,1) and configured to output only the first line for |
| # every key (-s -u), we end up with the smallest file for each tuple. |
| |
| echo "[*] Sorting candidate list (be patient)..." |
| |
| sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \ |
| sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script" |
| |
| if [ ! -s "$TRACE_DIR/.candidate_script" ]; then |
| echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2 |
| test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR" |
| exit 1 |
| fi |
| |
| # The sed command converted the sorted list to a shell script that populates |
| # BEST_FILE[tuple]="fname". Let's load that! |
| |
| . "$TRACE_DIR/.candidate_script" |
| |
| ########################## |
| # STEP 5: WRITING OUTPUT # |
| ########################## |
| |
| # The final trick is to grab the top pick for each tuple, unless said tuple is |
| # already set due to the inclusion of an earlier candidate; and then put all |
| # tuples associated with the newly-added file to the "already have" list. The |
| # loop works from least popular tuples and toward the most common ones. |
| |
| echo "[*] Processing candidates and writing output files..." |
| |
| CUR=0 |
| |
| touch "$TRACE_DIR/.already_have" |
| |
| while read -r cnt tuple; do |
| |
| CUR=$((CUR+1)) |
| printf "\\r Processing tuple $CUR/$TUPLE_COUNT with count $cnt... " |
| |
| # If we already have this tuple, skip it. |
| |
| grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue |
| |
| FN=${BEST_FILE[tuple]} |
| |
| # echo "tuple nr $CUR ($tuple cnt=$cnt) -> $FN" >> "$TRACE_DIR/.log" |
| $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN" |
| |
| if [ "$((CUR % 5))" = "0" ]; then |
| sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp" |
| mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have" |
| else |
| cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have" |
| fi |
| |
| done <"$TRACE_DIR/.all_uniq" |
| |
| echo |
| |
| OUT_COUNT=`ls -- "$OUT_DIR" | wc -l` |
| |
| if [ "$OUT_COUNT" = "1" ]; then |
| echo "[!] WARNING: All test cases had the same traces, check syntax!" |
| fi |
| |
| echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'." |
| echo |
| |
| test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR" |
| |
| exit 0 |