| /* |
| * Copyright (C) the libgit2 contributors. All rights reserved. |
| * |
| * This file is part of libgit2, distributed under the GNU GPL v2 with |
| * a Linking Exception. For full terms see the included COPYING file. |
| */ |
| |
| #include "common.h" |
| |
| #include "git2/sys/hashsig.h" |
| #include "futils.h" |
| #include "util.h" |
| |
| typedef uint32_t hashsig_t; |
| typedef uint64_t hashsig_state; |
| |
| #define HASHSIG_SCALE 100 |
| |
| #define HASHSIG_MAX_RUN 80 |
| #define HASHSIG_HASH_START 0x012345678ABCDEF0LL |
| #define HASHSIG_HASH_SHIFT 5 |
| |
| #define HASHSIG_HASH_MIX(S,CH) \ |
| (S) = ((S) << HASHSIG_HASH_SHIFT) - (S) + (hashsig_state)(CH) |
| |
| #define HASHSIG_HEAP_SIZE ((1 << 7) - 1) |
| #define HASHSIG_HEAP_MIN_SIZE 4 |
| |
| typedef int (*hashsig_cmp)(const void *a, const void *b, void *); |
| |
| typedef struct { |
| int size, asize; |
| hashsig_cmp cmp; |
| hashsig_t values[HASHSIG_HEAP_SIZE]; |
| } hashsig_heap; |
| |
| struct git_hashsig { |
| hashsig_heap mins; |
| hashsig_heap maxs; |
| size_t lines; |
| git_hashsig_option_t opt; |
| }; |
| |
| #define HEAP_LCHILD_OF(I) (((I)<<1)+1) |
| #define HEAP_RCHILD_OF(I) (((I)<<1)+2) |
| #define HEAP_PARENT_OF(I) (((I)-1)>>1) |
| |
| static void hashsig_heap_init(hashsig_heap *h, hashsig_cmp cmp) |
| { |
| h->size = 0; |
| h->asize = HASHSIG_HEAP_SIZE; |
| h->cmp = cmp; |
| } |
| |
| static int hashsig_cmp_max(const void *a, const void *b, void *payload) |
| { |
| hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; |
| GIT_UNUSED(payload); |
| return (av < bv) ? -1 : (av > bv) ? 1 : 0; |
| } |
| |
| static int hashsig_cmp_min(const void *a, const void *b, void *payload) |
| { |
| hashsig_t av = *(const hashsig_t *)a, bv = *(const hashsig_t *)b; |
| GIT_UNUSED(payload); |
| return (av > bv) ? -1 : (av < bv) ? 1 : 0; |
| } |
| |
| static void hashsig_heap_up(hashsig_heap *h, int el) |
| { |
| int parent_el = HEAP_PARENT_OF(el); |
| |
| while (el > 0 && h->cmp(&h->values[parent_el], &h->values[el], NULL) > 0) { |
| hashsig_t t = h->values[el]; |
| h->values[el] = h->values[parent_el]; |
| h->values[parent_el] = t; |
| |
| el = parent_el; |
| parent_el = HEAP_PARENT_OF(el); |
| } |
| } |
| |
| static void hashsig_heap_down(hashsig_heap *h, int el) |
| { |
| hashsig_t v, lv, rv; |
| |
| /* 'el < h->size / 2' tests if el is bottom row of heap */ |
| |
| while (el < h->size / 2) { |
| int lel = HEAP_LCHILD_OF(el), rel = HEAP_RCHILD_OF(el), swapel; |
| |
| v = h->values[el]; |
| lv = h->values[lel]; |
| rv = h->values[rel]; |
| |
| if (h->cmp(&v, &lv, NULL) < 0 && h->cmp(&v, &rv, NULL) < 0) |
| break; |
| |
| swapel = (h->cmp(&lv, &rv, NULL) < 0) ? lel : rel; |
| |
| h->values[el] = h->values[swapel]; |
| h->values[swapel] = v; |
| |
| el = swapel; |
| } |
| } |
| |
| static void hashsig_heap_sort(hashsig_heap *h) |
| { |
| /* only need to do this at the end for signature comparison */ |
| git__qsort_r(h->values, h->size, sizeof(hashsig_t), h->cmp, NULL); |
| } |
| |
| static void hashsig_heap_insert(hashsig_heap *h, hashsig_t val) |
| { |
| /* if heap is not full, insert new element */ |
| if (h->size < h->asize) { |
| h->values[h->size++] = val; |
| hashsig_heap_up(h, h->size - 1); |
| } |
| |
| /* if heap is full, pop top if new element should replace it */ |
| else if (h->cmp(&val, &h->values[0], NULL) > 0) { |
| h->size--; |
| h->values[0] = h->values[h->size]; |
| hashsig_heap_down(h, 0); |
| } |
| |
| } |
| |
| typedef struct { |
| int use_ignores; |
| uint8_t ignore_ch[256]; |
| } hashsig_in_progress; |
| |
| static void hashsig_in_progress_init( |
| hashsig_in_progress *prog, git_hashsig *sig) |
| { |
| int i; |
| |
| /* no more than one can be set */ |
| assert(!(sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) || |
| !(sig->opt & GIT_HASHSIG_SMART_WHITESPACE)); |
| |
| if (sig->opt & GIT_HASHSIG_IGNORE_WHITESPACE) { |
| for (i = 0; i < 256; ++i) |
| prog->ignore_ch[i] = git__isspace_nonlf(i); |
| prog->use_ignores = 1; |
| } else if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) { |
| for (i = 0; i < 256; ++i) |
| prog->ignore_ch[i] = git__isspace(i); |
| prog->use_ignores = 1; |
| } else { |
| memset(prog, 0, sizeof(*prog)); |
| } |
| } |
| |
| static int hashsig_add_hashes( |
| git_hashsig *sig, |
| const uint8_t *data, |
| size_t size, |
| hashsig_in_progress *prog) |
| { |
| const uint8_t *scan = data, *end = data + size; |
| hashsig_state state = HASHSIG_HASH_START; |
| int use_ignores = prog->use_ignores, len; |
| uint8_t ch; |
| |
| while (scan < end) { |
| state = HASHSIG_HASH_START; |
| |
| for (len = 0; scan < end && len < HASHSIG_MAX_RUN; ) { |
| ch = *scan; |
| |
| if (use_ignores) |
| for (; scan < end && git__isspace_nonlf(ch); ch = *scan) |
| ++scan; |
| else if (sig->opt & |
| (GIT_HASHSIG_IGNORE_WHITESPACE | GIT_HASHSIG_SMART_WHITESPACE)) |
| for (; scan < end && ch == '\r'; ch = *scan) |
| ++scan; |
| |
| /* peek at next character to decide what to do next */ |
| if (sig->opt & GIT_HASHSIG_SMART_WHITESPACE) |
| use_ignores = (ch == '\n'); |
| |
| if (scan >= end) |
| break; |
| ++scan; |
| |
| /* check run terminator */ |
| if (ch == '\n' || ch == '\0') { |
| sig->lines++; |
| break; |
| } |
| |
| ++len; |
| HASHSIG_HASH_MIX(state, ch); |
| } |
| |
| if (len > 0) { |
| hashsig_heap_insert(&sig->mins, (hashsig_t)state); |
| hashsig_heap_insert(&sig->maxs, (hashsig_t)state); |
| |
| while (scan < end && (*scan == '\n' || !*scan)) |
| ++scan; |
| } |
| } |
| |
| prog->use_ignores = use_ignores; |
| |
| return 0; |
| } |
| |
| static int hashsig_finalize_hashes(git_hashsig *sig) |
| { |
| if (sig->mins.size < HASHSIG_HEAP_MIN_SIZE && |
| !(sig->opt & GIT_HASHSIG_ALLOW_SMALL_FILES)) { |
| git_error_set(GIT_ERROR_INVALID, |
| "file too small for similarity signature calculation"); |
| return GIT_EBUFS; |
| } |
| |
| hashsig_heap_sort(&sig->mins); |
| hashsig_heap_sort(&sig->maxs); |
| |
| return 0; |
| } |
| |
| static git_hashsig *hashsig_alloc(git_hashsig_option_t opts) |
| { |
| git_hashsig *sig = git__calloc(1, sizeof(git_hashsig)); |
| if (!sig) |
| return NULL; |
| |
| hashsig_heap_init(&sig->mins, hashsig_cmp_min); |
| hashsig_heap_init(&sig->maxs, hashsig_cmp_max); |
| sig->opt = opts; |
| |
| return sig; |
| } |
| |
| int git_hashsig_create( |
| git_hashsig **out, |
| const char *buf, |
| size_t buflen, |
| git_hashsig_option_t opts) |
| { |
| int error; |
| hashsig_in_progress prog; |
| git_hashsig *sig = hashsig_alloc(opts); |
| GIT_ERROR_CHECK_ALLOC(sig); |
| |
| hashsig_in_progress_init(&prog, sig); |
| |
| error = hashsig_add_hashes(sig, (const uint8_t *)buf, buflen, &prog); |
| |
| if (!error) |
| error = hashsig_finalize_hashes(sig); |
| |
| if (!error) |
| *out = sig; |
| else |
| git_hashsig_free(sig); |
| |
| return error; |
| } |
| |
| int git_hashsig_create_fromfile( |
| git_hashsig **out, |
| const char *path, |
| git_hashsig_option_t opts) |
| { |
| uint8_t buf[0x1000]; |
| ssize_t buflen = 0; |
| int error = 0, fd; |
| hashsig_in_progress prog; |
| git_hashsig *sig = hashsig_alloc(opts); |
| GIT_ERROR_CHECK_ALLOC(sig); |
| |
| if ((fd = git_futils_open_ro(path)) < 0) { |
| git__free(sig); |
| return fd; |
| } |
| |
| hashsig_in_progress_init(&prog, sig); |
| |
| while (!error) { |
| if ((buflen = p_read(fd, buf, sizeof(buf))) <= 0) { |
| if ((error = (int)buflen) < 0) |
| git_error_set(GIT_ERROR_OS, |
| "read error on '%s' calculating similarity hashes", path); |
| break; |
| } |
| |
| error = hashsig_add_hashes(sig, buf, buflen, &prog); |
| } |
| |
| p_close(fd); |
| |
| if (!error) |
| error = hashsig_finalize_hashes(sig); |
| |
| if (!error) |
| *out = sig; |
| else |
| git_hashsig_free(sig); |
| |
| return error; |
| } |
| |
| void git_hashsig_free(git_hashsig *sig) |
| { |
| git__free(sig); |
| } |
| |
| static int hashsig_heap_compare(const hashsig_heap *a, const hashsig_heap *b) |
| { |
| int matches = 0, i, j, cmp; |
| |
| assert(a->cmp == b->cmp); |
| |
| /* hash heaps are sorted - just look for overlap vs total */ |
| |
| for (i = 0, j = 0; i < a->size && j < b->size; ) { |
| cmp = a->cmp(&a->values[i], &b->values[j], NULL); |
| |
| if (cmp < 0) |
| ++i; |
| else if (cmp > 0) |
| ++j; |
| else { |
| ++i; ++j; ++matches; |
| } |
| } |
| |
| return HASHSIG_SCALE * (matches * 2) / (a->size + b->size); |
| } |
| |
| int git_hashsig_compare(const git_hashsig *a, const git_hashsig *b) |
| { |
| /* if we have no elements in either file then each file is either |
| * empty or blank. if we're ignoring whitespace then the files are |
| * similar, otherwise they're dissimilar. |
| */ |
| if (a->mins.size == 0 && b->mins.size == 0) { |
| if ((!a->lines && !b->lines) || |
| (a->opt & GIT_HASHSIG_IGNORE_WHITESPACE)) |
| return HASHSIG_SCALE; |
| else |
| return 0; |
| } |
| |
| /* if we have fewer than the maximum number of elements, then just use |
| * one array since the two arrays will be the same |
| */ |
| if (a->mins.size < HASHSIG_HEAP_SIZE) |
| return hashsig_heap_compare(&a->mins, &b->mins); |
| else |
| return (hashsig_heap_compare(&a->mins, &b->mins) + |
| hashsig_heap_compare(&a->maxs, &b->maxs)) / 2; |
| } |