| /* |
| * Copyright (C) the libgit2 contributors. All rights reserved. |
| * |
| * This file is part of libgit2, distributed under the GNU GPL v2 with |
| * a Linking Exception. For full terms see the included COPYING file. |
| */ |
| |
| #include "diff_tform.h" |
| |
| #include "git2/config.h" |
| #include "git2/blob.h" |
| #include "git2/sys/hashsig.h" |
| |
| #include "diff.h" |
| #include "diff_generate.h" |
| #include "path.h" |
| #include "fileops.h" |
| #include "config.h" |
| |
| git_diff_delta *git_diff__delta_dup( |
| const git_diff_delta *d, git_pool *pool) |
| { |
| git_diff_delta *delta = git__malloc(sizeof(git_diff_delta)); |
| if (!delta) |
| return NULL; |
| |
| memcpy(delta, d, sizeof(git_diff_delta)); |
| GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags); |
| |
| if (d->old_file.path != NULL) { |
| delta->old_file.path = git_pool_strdup(pool, d->old_file.path); |
| if (delta->old_file.path == NULL) |
| goto fail; |
| } |
| |
| if (d->new_file.path != d->old_file.path && d->new_file.path != NULL) { |
| delta->new_file.path = git_pool_strdup(pool, d->new_file.path); |
| if (delta->new_file.path == NULL) |
| goto fail; |
| } else { |
| delta->new_file.path = delta->old_file.path; |
| } |
| |
| return delta; |
| |
| fail: |
| git__free(delta); |
| return NULL; |
| } |
| |
| git_diff_delta *git_diff__merge_like_cgit( |
| const git_diff_delta *a, |
| const git_diff_delta *b, |
| git_pool *pool) |
| { |
| git_diff_delta *dup; |
| |
| /* Emulate C git for merging two diffs (a la 'git diff <sha>'). |
| * |
| * When C git does a diff between the work dir and a tree, it actually |
| * diffs with the index but uses the workdir contents. This emulates |
| * those choices so we can emulate the type of diff. |
| * |
| * We have three file descriptions here, let's call them: |
| * f1 = a->old_file |
| * f2 = a->new_file AND b->old_file |
| * f3 = b->new_file |
| */ |
| |
| /* If one of the diffs is a conflict, just dup it */ |
| if (b->status == GIT_DELTA_CONFLICTED) |
| return git_diff__delta_dup(b, pool); |
| if (a->status == GIT_DELTA_CONFLICTED) |
| return git_diff__delta_dup(a, pool); |
| |
| /* if f2 == f3 or f2 is deleted, then just dup the 'a' diff */ |
| if (b->status == GIT_DELTA_UNMODIFIED || a->status == GIT_DELTA_DELETED) |
| return git_diff__delta_dup(a, pool); |
| |
| /* otherwise, base this diff on the 'b' diff */ |
| if ((dup = git_diff__delta_dup(b, pool)) == NULL) |
| return NULL; |
| |
| /* If 'a' status is uninteresting, then we're done */ |
| if (a->status == GIT_DELTA_UNMODIFIED || |
| a->status == GIT_DELTA_UNTRACKED || |
| a->status == GIT_DELTA_UNREADABLE) |
| return dup; |
| |
| assert(b->status != GIT_DELTA_UNMODIFIED); |
| |
| /* A cgit exception is that the diff of a file that is only in the |
| * index (i.e. not in HEAD nor workdir) is given as empty. |
| */ |
| if (dup->status == GIT_DELTA_DELETED) { |
| if (a->status == GIT_DELTA_ADDED) { |
| dup->status = GIT_DELTA_UNMODIFIED; |
| dup->nfiles = 2; |
| } |
| /* else don't overwrite DELETE status */ |
| } else { |
| dup->status = a->status; |
| dup->nfiles = a->nfiles; |
| } |
| |
| git_oid_cpy(&dup->old_file.id, &a->old_file.id); |
| dup->old_file.mode = a->old_file.mode; |
| dup->old_file.size = a->old_file.size; |
| dup->old_file.flags = a->old_file.flags; |
| |
| return dup; |
| } |
| |
| int git_diff__merge( |
| git_diff *onto, const git_diff *from, git_diff__merge_cb cb) |
| { |
| int error = 0; |
| git_pool onto_pool; |
| git_vector onto_new; |
| git_diff_delta *delta; |
| bool ignore_case, reversed; |
| unsigned int i, j; |
| |
| assert(onto && from); |
| |
| if (!from->deltas.length) |
| return 0; |
| |
| ignore_case = ((onto->opts.flags & GIT_DIFF_IGNORE_CASE) != 0); |
| reversed = ((onto->opts.flags & GIT_DIFF_REVERSE) != 0); |
| |
| if (ignore_case != ((from->opts.flags & GIT_DIFF_IGNORE_CASE) != 0) || |
| reversed != ((from->opts.flags & GIT_DIFF_REVERSE) != 0)) { |
| git_error_set(GIT_ERROR_INVALID, |
| "attempt to merge diffs created with conflicting options"); |
| return -1; |
| } |
| |
| if (git_vector_init(&onto_new, onto->deltas.length, git_diff_delta__cmp) < 0) |
| return -1; |
| |
| git_pool_init(&onto_pool, 1); |
| |
| for (i = 0, j = 0; i < onto->deltas.length || j < from->deltas.length; ) { |
| git_diff_delta *o = GIT_VECTOR_GET(&onto->deltas, i); |
| const git_diff_delta *f = GIT_VECTOR_GET(&from->deltas, j); |
| int cmp = !f ? -1 : !o ? 1 : |
| STRCMP_CASESELECT(ignore_case, o->old_file.path, f->old_file.path); |
| |
| if (cmp < 0) { |
| delta = git_diff__delta_dup(o, &onto_pool); |
| i++; |
| } else if (cmp > 0) { |
| delta = git_diff__delta_dup(f, &onto_pool); |
| j++; |
| } else { |
| const git_diff_delta *left = reversed ? f : o; |
| const git_diff_delta *right = reversed ? o : f; |
| |
| delta = cb(left, right, &onto_pool); |
| i++; |
| j++; |
| } |
| |
| /* the ignore rules for the target may not match the source |
| * or the result of a merged delta could be skippable... |
| */ |
| if (delta && git_diff_delta__should_skip(&onto->opts, delta)) { |
| git__free(delta); |
| continue; |
| } |
| |
| if ((error = !delta ? -1 : git_vector_insert(&onto_new, delta)) < 0) |
| break; |
| } |
| |
| if (!error) { |
| git_vector_swap(&onto->deltas, &onto_new); |
| git_pool_swap(&onto->pool, &onto_pool); |
| |
| if ((onto->opts.flags & GIT_DIFF_REVERSE) != 0) |
| onto->old_src = from->old_src; |
| else |
| onto->new_src = from->new_src; |
| |
| /* prefix strings also come from old pool, so recreate those.*/ |
| onto->opts.old_prefix = |
| git_pool_strdup_safe(&onto->pool, onto->opts.old_prefix); |
| onto->opts.new_prefix = |
| git_pool_strdup_safe(&onto->pool, onto->opts.new_prefix); |
| } |
| |
| git_vector_free_deep(&onto_new); |
| git_pool_clear(&onto_pool); |
| |
| return error; |
| } |
| |
| int git_diff_merge(git_diff *onto, const git_diff *from) |
| { |
| return git_diff__merge(onto, from, git_diff__merge_like_cgit); |
| } |
| |
| int git_diff_find_similar__hashsig_for_file( |
| void **out, const git_diff_file *f, const char *path, void *p) |
| { |
| git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p; |
| |
| GIT_UNUSED(f); |
| return git_hashsig_create_fromfile((git_hashsig **)out, path, opt); |
| } |
| |
| int git_diff_find_similar__hashsig_for_buf( |
| void **out, const git_diff_file *f, const char *buf, size_t len, void *p) |
| { |
| git_hashsig_option_t opt = (git_hashsig_option_t)(intptr_t)p; |
| |
| GIT_UNUSED(f); |
| return git_hashsig_create((git_hashsig **)out, buf, len, opt); |
| } |
| |
| void git_diff_find_similar__hashsig_free(void *sig, void *payload) |
| { |
| GIT_UNUSED(payload); |
| git_hashsig_free(sig); |
| } |
| |
/*
 * Built-in metric callback: compare two signatures.
 *
 * A non-negative result of git_hashsig_compare is stored in *score and 0
 * is returned; a negative result is returned as the error code and
 * *score is left untouched.
 */
int git_diff_find_similar__calc_similarity(
	int *score, void *siga, void *sigb, void *payload)
{
	int compared = git_hashsig_compare(siga, sigb);

	GIT_UNUSED(payload);

	if (compared >= 0) {
		*score = compared;
		return 0;
	}

	return compared;
}
| |
| #define DEFAULT_THRESHOLD 50 |
| #define DEFAULT_BREAK_REWRITE_THRESHOLD 60 |
| #define DEFAULT_RENAME_LIMIT 200 |
| |
| static int normalize_find_opts( |
| git_diff *diff, |
| git_diff_find_options *opts, |
| const git_diff_find_options *given) |
| { |
| git_config *cfg = NULL; |
| git_hashsig_option_t hashsig_opts; |
| |
| GIT_ERROR_CHECK_VERSION(given, GIT_DIFF_FIND_OPTIONS_VERSION, "git_diff_find_options"); |
| |
| if (diff->repo != NULL && |
| git_repository_config__weakptr(&cfg, diff->repo) < 0) |
| return -1; |
| |
| if (given) |
| memcpy(opts, given, sizeof(*opts)); |
| |
| if (!given || |
| (given->flags & GIT_DIFF_FIND_ALL) == GIT_DIFF_FIND_BY_CONFIG) |
| { |
| if (cfg) { |
| char *rule = |
| git_config__get_string_force(cfg, "diff.renames", "true"); |
| int boolval; |
| |
| if (!git__parse_bool(&boolval, rule) && !boolval) |
| /* don't set FIND_RENAMES if bool value is false */; |
| else if (!strcasecmp(rule, "copies") || !strcasecmp(rule, "copy")) |
| opts->flags |= GIT_DIFF_FIND_RENAMES | GIT_DIFF_FIND_COPIES; |
| else |
| opts->flags |= GIT_DIFF_FIND_RENAMES; |
| |
| git__free(rule); |
| } else { |
| /* set default flag */ |
| opts->flags |= GIT_DIFF_FIND_RENAMES; |
| } |
| } |
| |
| /* some flags imply others */ |
| |
| if (opts->flags & GIT_DIFF_FIND_EXACT_MATCH_ONLY) { |
| /* if we are only looking for exact matches, then don't turn |
| * MODIFIED items into ADD/DELETE pairs because it's too picky |
| */ |
| opts->flags &= ~(GIT_DIFF_FIND_REWRITES | GIT_DIFF_BREAK_REWRITES); |
| |
| /* similarly, don't look for self-rewrites to split */ |
| opts->flags &= ~GIT_DIFF_FIND_RENAMES_FROM_REWRITES; |
| } |
| |
| if (opts->flags & GIT_DIFF_FIND_RENAMES_FROM_REWRITES) |
| opts->flags |= GIT_DIFF_FIND_RENAMES; |
| |
| if (opts->flags & GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED) |
| opts->flags |= GIT_DIFF_FIND_COPIES; |
| |
| if (opts->flags & GIT_DIFF_BREAK_REWRITES) |
| opts->flags |= GIT_DIFF_FIND_REWRITES; |
| |
| #define USE_DEFAULT(X) ((X) == 0 || (X) > 100) |
| |
| if (USE_DEFAULT(opts->rename_threshold)) |
| opts->rename_threshold = DEFAULT_THRESHOLD; |
| |
| if (USE_DEFAULT(opts->rename_from_rewrite_threshold)) |
| opts->rename_from_rewrite_threshold = DEFAULT_THRESHOLD; |
| |
| if (USE_DEFAULT(opts->copy_threshold)) |
| opts->copy_threshold = DEFAULT_THRESHOLD; |
| |
| if (USE_DEFAULT(opts->break_rewrite_threshold)) |
| opts->break_rewrite_threshold = DEFAULT_BREAK_REWRITE_THRESHOLD; |
| |
| #undef USE_DEFAULT |
| |
| if (!opts->rename_limit) { |
| if (cfg) { |
| opts->rename_limit = git_config__get_int_force( |
| cfg, "diff.renamelimit", DEFAULT_RENAME_LIMIT); |
| } |
| |
| if (opts->rename_limit <= 0) |
| opts->rename_limit = DEFAULT_RENAME_LIMIT; |
| } |
| |
| /* assign the internal metric with whitespace flag as payload */ |
| if (!opts->metric) { |
| opts->metric = git__malloc(sizeof(git_diff_similarity_metric)); |
| GIT_ERROR_CHECK_ALLOC(opts->metric); |
| |
| opts->metric->file_signature = git_diff_find_similar__hashsig_for_file; |
| opts->metric->buffer_signature = git_diff_find_similar__hashsig_for_buf; |
| opts->metric->free_signature = git_diff_find_similar__hashsig_free; |
| opts->metric->similarity = git_diff_find_similar__calc_similarity; |
| |
| if (opts->flags & GIT_DIFF_FIND_IGNORE_WHITESPACE) |
| hashsig_opts = GIT_HASHSIG_IGNORE_WHITESPACE; |
| else if (opts->flags & GIT_DIFF_FIND_DONT_IGNORE_WHITESPACE) |
| hashsig_opts = GIT_HASHSIG_NORMAL; |
| else |
| hashsig_opts = GIT_HASHSIG_SMART_WHITESPACE; |
| hashsig_opts |= GIT_HASHSIG_ALLOW_SMALL_FILES; |
| opts->metric->payload = (void *)hashsig_opts; |
| } |
| |
| return 0; |
| } |
| |
| static int insert_delete_side_of_split( |
| git_diff *diff, git_vector *onto, const git_diff_delta *delta) |
| { |
| /* make new record for DELETED side of split */ |
| git_diff_delta *deleted = git_diff__delta_dup(delta, &diff->pool); |
| GIT_ERROR_CHECK_ALLOC(deleted); |
| |
| deleted->status = GIT_DELTA_DELETED; |
| deleted->nfiles = 1; |
| memset(&deleted->new_file, 0, sizeof(deleted->new_file)); |
| deleted->new_file.path = deleted->old_file.path; |
| deleted->new_file.flags |= GIT_DIFF_FLAG_VALID_ID; |
| |
| return git_vector_insert(onto, deleted); |
| } |
| |
| static int apply_splits_and_deletes( |
| git_diff *diff, size_t expected_size, bool actually_split) |
| { |
| git_vector onto = GIT_VECTOR_INIT; |
| size_t i; |
| git_diff_delta *delta; |
| |
| if (git_vector_init(&onto, expected_size, git_diff_delta__cmp) < 0) |
| return -1; |
| |
| /* build new delta list without TO_DELETE and splitting TO_SPLIT */ |
| git_vector_foreach(&diff->deltas, i, delta) { |
| if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0) |
| continue; |
| |
| if ((delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0 && actually_split) { |
| delta->similarity = 0; |
| |
| if (insert_delete_side_of_split(diff, &onto, delta) < 0) |
| goto on_error; |
| |
| if (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) |
| delta->status = GIT_DELTA_UNTRACKED; |
| else |
| delta->status = GIT_DELTA_ADDED; |
| delta->nfiles = 1; |
| memset(&delta->old_file, 0, sizeof(delta->old_file)); |
| delta->old_file.path = delta->new_file.path; |
| delta->old_file.flags |= GIT_DIFF_FLAG_VALID_ID; |
| } |
| |
| /* clean up delta before inserting into new list */ |
| GIT_DIFF_FLAG__CLEAR_INTERNAL(delta->flags); |
| |
| if (delta->status != GIT_DELTA_COPIED && |
| delta->status != GIT_DELTA_RENAMED && |
| (delta->status != GIT_DELTA_MODIFIED || actually_split)) |
| delta->similarity = 0; |
| |
| /* insert into new list */ |
| if (git_vector_insert(&onto, delta) < 0) |
| goto on_error; |
| } |
| |
| /* cannot return an error past this point */ |
| |
| /* free deltas from old list that didn't make it to the new one */ |
| git_vector_foreach(&diff->deltas, i, delta) { |
| if ((delta->flags & GIT_DIFF_FLAG__TO_DELETE) != 0) |
| git__free(delta); |
| } |
| |
| /* swap new delta list into place */ |
| git_vector_swap(&diff->deltas, &onto); |
| git_vector_free(&onto); |
| git_vector_sort(&diff->deltas); |
| |
| return 0; |
| |
| on_error: |
| git_vector_free_deep(&onto); |
| |
| return -1; |
| } |
| |
| GIT_INLINE(git_diff_file *) similarity_get_file(git_diff *diff, size_t idx) |
| { |
| git_diff_delta *delta = git_vector_get(&diff->deltas, idx / 2); |
| return (idx & 1) ? &delta->new_file : &delta->old_file; |
| } |
| |
| typedef struct { |
| size_t idx; |
| git_iterator_type_t src; |
| git_repository *repo; |
| git_diff_file *file; |
| git_buf data; |
| git_odb_object *odb_obj; |
| git_blob *blob; |
| } similarity_info; |
| |
| static int similarity_init( |
| similarity_info *info, git_diff *diff, size_t file_idx) |
| { |
| info->idx = file_idx; |
| info->src = (file_idx & 1) ? diff->new_src : diff->old_src; |
| info->repo = diff->repo; |
| info->file = similarity_get_file(diff, file_idx); |
| info->odb_obj = NULL; |
| info->blob = NULL; |
| git_buf_init(&info->data, 0); |
| |
| if (info->file->size > 0 || info->src == GIT_ITERATOR_TYPE_WORKDIR) |
| return 0; |
| |
| return git_diff_file__resolve_zero_size( |
| info->file, &info->odb_obj, info->repo); |
| } |
| |
| static int similarity_sig( |
| similarity_info *info, |
| const git_diff_find_options *opts, |
| void **cache) |
| { |
| int error = 0; |
| git_diff_file *file = info->file; |
| |
| if (info->src == GIT_ITERATOR_TYPE_WORKDIR) { |
| if ((error = git_buf_joinpath( |
| &info->data, git_repository_workdir(info->repo), file->path)) < 0) |
| return error; |
| |
| /* if path is not a regular file, just skip this item */ |
| if (!git_path_isfile(info->data.ptr)) |
| return 0; |
| |
| /* TODO: apply wd-to-odb filters to file data if necessary */ |
| |
| error = opts->metric->file_signature( |
| &cache[info->idx], info->file, |
| info->data.ptr, opts->metric->payload); |
| } else { |
| /* if we didn't initially know the size, we might have an odb_obj |
| * around from earlier, so convert that, otherwise load the blob now |
| */ |
| if (info->odb_obj != NULL) |
| error = git_object__from_odb_object( |
| (git_object **)&info->blob, info->repo, |
| info->odb_obj, GIT_OBJECT_BLOB); |
| else |
| error = git_blob_lookup(&info->blob, info->repo, &file->id); |
| |
| if (error < 0) { |
| /* if lookup fails, just skip this item in similarity calc */ |
| git_error_clear(); |
| } else { |
| size_t sz; |
| |
| /* index size may not be actual blob size if filtered */ |
| if (file->size != git_blob_rawsize(info->blob)) |
| file->size = git_blob_rawsize(info->blob); |
| |
| sz = (size_t)(git__is_sizet(file->size) ? file->size : -1); |
| |
| error = opts->metric->buffer_signature( |
| &cache[info->idx], info->file, |
| git_blob_rawcontent(info->blob), sz, opts->metric->payload); |
| } |
| } |
| |
| return error; |
| } |
| |
| static void similarity_unload(similarity_info *info) |
| { |
| if (info->odb_obj) |
| git_odb_object_free(info->odb_obj); |
| |
| if (info->blob) |
| git_blob_free(info->blob); |
| else |
| git_buf_dispose(&info->data); |
| } |
| |
| #define FLAG_SET(opts,flag_name) (((opts)->flags & flag_name) != 0) |
| |
| /* - score < 0 means files cannot be compared |
| * - score >= 100 means files are exact match |
| * - score == 0 means files are completely different |
| */ |
| static int similarity_measure( |
| int *score, |
| git_diff *diff, |
| const git_diff_find_options *opts, |
| void **cache, |
| size_t a_idx, |
| size_t b_idx) |
| { |
| git_diff_file *a_file = similarity_get_file(diff, a_idx); |
| git_diff_file *b_file = similarity_get_file(diff, b_idx); |
| bool exact_match = FLAG_SET(opts, GIT_DIFF_FIND_EXACT_MATCH_ONLY); |
| int error = 0; |
| similarity_info a_info, b_info; |
| |
| *score = -1; |
| |
| /* don't try to compare things that aren't files */ |
| if (!GIT_MODE_ISBLOB(a_file->mode) || !GIT_MODE_ISBLOB(b_file->mode)) |
| return 0; |
| |
| /* if exact match is requested, force calculation of missing OIDs now */ |
| if (exact_match) { |
| if (git_oid_iszero(&a_file->id) && |
| diff->old_src == GIT_ITERATOR_TYPE_WORKDIR && |
| !git_diff__oid_for_file(&a_file->id, |
| diff, a_file->path, a_file->mode, a_file->size)) |
| a_file->flags |= GIT_DIFF_FLAG_VALID_ID; |
| |
| if (git_oid_iszero(&b_file->id) && |
| diff->new_src == GIT_ITERATOR_TYPE_WORKDIR && |
| !git_diff__oid_for_file(&b_file->id, |
| diff, b_file->path, b_file->mode, b_file->size)) |
| b_file->flags |= GIT_DIFF_FLAG_VALID_ID; |
| } |
| |
| /* check OID match as a quick test */ |
| if (git_oid__cmp(&a_file->id, &b_file->id) == 0) { |
| *score = 100; |
| return 0; |
| } |
| |
| /* don't calculate signatures if we are doing exact match */ |
| if (exact_match) { |
| *score = 0; |
| return 0; |
| } |
| |
| memset(&a_info, 0, sizeof(a_info)); |
| memset(&b_info, 0, sizeof(b_info)); |
| |
| /* set up similarity data (will try to update missing file sizes) */ |
| if (!cache[a_idx] && (error = similarity_init(&a_info, diff, a_idx)) < 0) |
| return error; |
| if (!cache[b_idx] && (error = similarity_init(&b_info, diff, b_idx)) < 0) |
| goto cleanup; |
| |
| /* check if file sizes are nowhere near each other */ |
| if (a_file->size > 127 && |
| b_file->size > 127 && |
| (a_file->size > (b_file->size << 3) || |
| b_file->size > (a_file->size << 3))) |
| goto cleanup; |
| |
| /* update signature cache if needed */ |
| if (!cache[a_idx]) { |
| if ((error = similarity_sig(&a_info, opts, cache)) < 0) |
| goto cleanup; |
| } |
| if (!cache[b_idx]) { |
| if ((error = similarity_sig(&b_info, opts, cache)) < 0) |
| goto cleanup; |
| } |
| |
| /* calculate similarity provided that the metric choose to process |
| * both the a and b files (some may not if file is too big, etc). |
| */ |
| if (cache[a_idx] && cache[b_idx]) |
| error = opts->metric->similarity( |
| score, cache[a_idx], cache[b_idx], opts->metric->payload); |
| |
| cleanup: |
| similarity_unload(&a_info); |
| similarity_unload(&b_info); |
| |
| return error; |
| } |
| |
| static int calc_self_similarity( |
| git_diff *diff, |
| const git_diff_find_options *opts, |
| size_t delta_idx, |
| void **cache) |
| { |
| int error, similarity = -1; |
| git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx); |
| |
| if ((delta->flags & GIT_DIFF_FLAG__HAS_SELF_SIMILARITY) != 0) |
| return 0; |
| |
| error = similarity_measure( |
| &similarity, diff, opts, cache, 2 * delta_idx, 2 * delta_idx + 1); |
| if (error < 0) |
| return error; |
| |
| if (similarity >= 0) { |
| delta->similarity = (uint16_t)similarity; |
| delta->flags |= GIT_DIFF_FLAG__HAS_SELF_SIMILARITY; |
| } |
| |
| return 0; |
| } |
| |
| static bool is_rename_target( |
| git_diff *diff, |
| const git_diff_find_options *opts, |
| size_t delta_idx, |
| void **cache) |
| { |
| git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx); |
| |
| /* skip things that aren't plain blobs */ |
| if (!GIT_MODE_ISBLOB(delta->new_file.mode)) |
| return false; |
| |
| /* only consider ADDED, RENAMED, COPIED, and split MODIFIED as |
| * targets; maybe include UNTRACKED if requested. |
| */ |
| switch (delta->status) { |
| case GIT_DELTA_UNMODIFIED: |
| case GIT_DELTA_DELETED: |
| case GIT_DELTA_IGNORED: |
| case GIT_DELTA_CONFLICTED: |
| return false; |
| |
| case GIT_DELTA_MODIFIED: |
| if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) && |
| !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES)) |
| return false; |
| |
| if (calc_self_similarity(diff, opts, delta_idx, cache) < 0) |
| return false; |
| |
| if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) && |
| delta->similarity < opts->break_rewrite_threshold) { |
| delta->flags |= GIT_DIFF_FLAG__TO_SPLIT; |
| break; |
| } |
| if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && |
| delta->similarity < opts->rename_from_rewrite_threshold) { |
| delta->flags |= GIT_DIFF_FLAG__TO_SPLIT; |
| break; |
| } |
| |
| return false; |
| |
| case GIT_DELTA_UNTRACKED: |
| if (!FLAG_SET(opts, GIT_DIFF_FIND_FOR_UNTRACKED)) |
| return false; |
| break; |
| |
| default: /* all other status values should be checked */ |
| break; |
| } |
| |
| delta->flags |= GIT_DIFF_FLAG__IS_RENAME_TARGET; |
| return true; |
| } |
| |
| static bool is_rename_source( |
| git_diff *diff, |
| const git_diff_find_options *opts, |
| size_t delta_idx, |
| void **cache) |
| { |
| git_diff_delta *delta = GIT_VECTOR_GET(&diff->deltas, delta_idx); |
| |
| /* skip things that aren't blobs */ |
| if (!GIT_MODE_ISBLOB(delta->old_file.mode)) |
| return false; |
| |
| switch (delta->status) { |
| case GIT_DELTA_ADDED: |
| case GIT_DELTA_UNTRACKED: |
| case GIT_DELTA_UNREADABLE: |
| case GIT_DELTA_IGNORED: |
| case GIT_DELTA_CONFLICTED: |
| return false; |
| |
| case GIT_DELTA_DELETED: |
| case GIT_DELTA_TYPECHANGE: |
| break; |
| |
| case GIT_DELTA_UNMODIFIED: |
| if (!FLAG_SET(opts, GIT_DIFF_FIND_COPIES_FROM_UNMODIFIED)) |
| return false; |
| if (FLAG_SET(opts, GIT_DIFF_FIND_REMOVE_UNMODIFIED)) |
| delta->flags |= GIT_DIFF_FLAG__TO_DELETE; |
| break; |
| |
| default: /* MODIFIED, RENAMED, COPIED */ |
| /* if we're finding copies, this could be a source */ |
| if (FLAG_SET(opts, GIT_DIFF_FIND_COPIES)) |
| break; |
| |
| /* otherwise, this is only a source if we can split it */ |
| if (!FLAG_SET(opts, GIT_DIFF_FIND_REWRITES) && |
| !FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES)) |
| return false; |
| |
| if (calc_self_similarity(diff, opts, delta_idx, cache) < 0) |
| return false; |
| |
| if (FLAG_SET(opts, GIT_DIFF_BREAK_REWRITES) && |
| delta->similarity < opts->break_rewrite_threshold) { |
| delta->flags |= GIT_DIFF_FLAG__TO_SPLIT; |
| break; |
| } |
| |
| if (FLAG_SET(opts, GIT_DIFF_FIND_RENAMES_FROM_REWRITES) && |
| delta->similarity < opts->rename_from_rewrite_threshold) |
| break; |
| |
| return false; |
| } |
| |
| delta->flags |= GIT_DIFF_FLAG__IS_RENAME_SOURCE; |
| return true; |
| } |
| |
| GIT_INLINE(bool) delta_is_split(git_diff_delta *delta) |
| { |
| return (delta->status == GIT_DELTA_TYPECHANGE || |
| (delta->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0); |
| } |
| |
| GIT_INLINE(bool) delta_is_new_only(git_diff_delta *delta) |
| { |
| return (delta->status == GIT_DELTA_ADDED || |
| delta->status == GIT_DELTA_UNTRACKED || |
| delta->status == GIT_DELTA_UNREADABLE || |
| delta->status == GIT_DELTA_IGNORED); |
| } |
| |
| GIT_INLINE(void) delta_make_rename( |
| git_diff_delta *to, const git_diff_delta *from, uint16_t similarity) |
| { |
| to->status = GIT_DELTA_RENAMED; |
| to->similarity = similarity; |
| to->nfiles = 2; |
| memcpy(&to->old_file, &from->old_file, sizeof(to->old_file)); |
| to->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; |
| } |
| |
| typedef struct { |
| size_t idx; |
| uint16_t similarity; |
| } diff_find_match; |
| |
| int git_diff_find_similar( |
| git_diff *diff, |
| const git_diff_find_options *given_opts) |
| { |
| size_t s, t; |
| int error = 0, result; |
| uint16_t similarity; |
| git_diff_delta *src, *tgt; |
| git_diff_find_options opts = GIT_DIFF_FIND_OPTIONS_INIT; |
| size_t num_deltas, num_srcs = 0, num_tgts = 0; |
| size_t tried_srcs = 0, tried_tgts = 0; |
| size_t num_rewrites = 0, num_updates = 0, num_bumped = 0; |
| size_t sigcache_size; |
| void **sigcache = NULL; /* cache of similarity metric file signatures */ |
| diff_find_match *tgt2src = NULL; |
| diff_find_match *src2tgt = NULL; |
| diff_find_match *tgt2src_copy = NULL; |
| diff_find_match *best_match; |
| git_diff_file swap; |
| |
| assert(diff); |
| |
| if ((error = normalize_find_opts(diff, &opts, given_opts)) < 0) |
| return error; |
| |
| num_deltas = diff->deltas.length; |
| |
| /* TODO: maybe abort if deltas.length > rename_limit ??? */ |
| if (!num_deltas || !git__is_uint32(num_deltas)) |
| goto cleanup; |
| |
| /* No flags set; nothing to do */ |
| if ((opts.flags & GIT_DIFF_FIND_ALL) == 0) |
| goto cleanup; |
| |
| GIT_ERROR_CHECK_ALLOC_MULTIPLY(&sigcache_size, num_deltas, 2); |
| sigcache = git__calloc(sigcache_size, sizeof(void *)); |
| GIT_ERROR_CHECK_ALLOC(sigcache); |
| |
| /* Label rename sources and targets |
| * |
| * This will also set self-similarity scores for MODIFIED files and |
| * mark them for splitting if break-rewrites is enabled |
| */ |
| git_vector_foreach(&diff->deltas, t, tgt) { |
| if (is_rename_source(diff, &opts, t, sigcache)) |
| ++num_srcs; |
| |
| if (is_rename_target(diff, &opts, t, sigcache)) |
| ++num_tgts; |
| |
| if ((tgt->flags & GIT_DIFF_FLAG__TO_SPLIT) != 0) |
| num_rewrites++; |
| } |
| |
| /* if there are no candidate srcs or tgts, we're done */ |
| if (!num_srcs || !num_tgts) |
| goto cleanup; |
| |
| src2tgt = git__calloc(num_deltas, sizeof(diff_find_match)); |
| GIT_ERROR_CHECK_ALLOC(src2tgt); |
| tgt2src = git__calloc(num_deltas, sizeof(diff_find_match)); |
| GIT_ERROR_CHECK_ALLOC(tgt2src); |
| |
| if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) { |
| tgt2src_copy = git__calloc(num_deltas, sizeof(diff_find_match)); |
| GIT_ERROR_CHECK_ALLOC(tgt2src_copy); |
| } |
| |
| /* |
| * Find best-fit matches for rename / copy candidates |
| */ |
| |
| find_best_matches: |
| tried_tgts = num_bumped = 0; |
| |
| git_vector_foreach(&diff->deltas, t, tgt) { |
| /* skip things that are not rename targets */ |
| if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) |
| continue; |
| |
| tried_srcs = 0; |
| |
| git_vector_foreach(&diff->deltas, s, src) { |
| /* skip things that are not rename sources */ |
| if ((src->flags & GIT_DIFF_FLAG__IS_RENAME_SOURCE) == 0) |
| continue; |
| |
| /* calculate similarity for this pair and find best match */ |
| if (s == t) |
| result = -1; /* don't measure self-similarity here */ |
| else if ((error = similarity_measure( |
| &result, diff, &opts, sigcache, 2 * s, 2 * t + 1)) < 0) |
| goto cleanup; |
| |
| if (result < 0) |
| continue; |
| similarity = (uint16_t)result; |
| |
| /* is this a better rename? */ |
| if (tgt2src[t].similarity < similarity && |
| src2tgt[s].similarity < similarity) |
| { |
| /* eject old mapping */ |
| if (src2tgt[s].similarity > 0) { |
| tgt2src[src2tgt[s].idx].similarity = 0; |
| num_bumped++; |
| } |
| if (tgt2src[t].similarity > 0) { |
| src2tgt[tgt2src[t].idx].similarity = 0; |
| num_bumped++; |
| } |
| |
| /* write new mapping */ |
| tgt2src[t].idx = s; |
| tgt2src[t].similarity = similarity; |
| src2tgt[s].idx = t; |
| src2tgt[s].similarity = similarity; |
| } |
| |
| /* keep best absolute match for copies */ |
| if (tgt2src_copy != NULL && |
| tgt2src_copy[t].similarity < similarity) |
| { |
| tgt2src_copy[t].idx = s; |
| tgt2src_copy[t].similarity = similarity; |
| } |
| |
| if (++tried_srcs >= num_srcs) |
| break; |
| |
| /* cap on maximum targets we'll examine (per "tgt" file) */ |
| if (tried_srcs > opts.rename_limit) |
| break; |
| } |
| |
| if (++tried_tgts >= num_tgts) |
| break; |
| } |
| |
| if (num_bumped > 0) /* try again if we bumped some items */ |
| goto find_best_matches; |
| |
| /* |
| * Rewrite the diffs with renames / copies |
| */ |
| |
| git_vector_foreach(&diff->deltas, t, tgt) { |
| /* skip things that are not rename targets */ |
| if ((tgt->flags & GIT_DIFF_FLAG__IS_RENAME_TARGET) == 0) |
| continue; |
| |
| /* check if this delta was the target of a similarity */ |
| if (tgt2src[t].similarity) |
| best_match = &tgt2src[t]; |
| else if (tgt2src_copy && tgt2src_copy[t].similarity) |
| best_match = &tgt2src_copy[t]; |
| else |
| continue; |
| |
| s = best_match->idx; |
| src = GIT_VECTOR_GET(&diff->deltas, s); |
| |
| /* possible scenarios: |
| * 1. from DELETE to ADD/UNTRACK/IGNORE = RENAME |
| * 2. from DELETE to SPLIT/TYPECHANGE = RENAME + DELETE |
| * 3. from SPLIT/TYPECHANGE to ADD/UNTRACK/IGNORE = ADD + RENAME |
| * 4. from SPLIT/TYPECHANGE to SPLIT/TYPECHANGE = RENAME + SPLIT |
| * 5. from OTHER to ADD/UNTRACK/IGNORE = OTHER + COPY |
| */ |
| |
| if (src->status == GIT_DELTA_DELETED) { |
| |
| if (delta_is_new_only(tgt)) { |
| |
| if (best_match->similarity < opts.rename_threshold) |
| continue; |
| |
| delta_make_rename(tgt, src, best_match->similarity); |
| |
| src->flags |= GIT_DIFF_FLAG__TO_DELETE; |
| num_rewrites++; |
| } else { |
| assert(delta_is_split(tgt)); |
| |
| if (best_match->similarity < opts.rename_from_rewrite_threshold) |
| continue; |
| |
| memcpy(&swap, &tgt->old_file, sizeof(swap)); |
| |
| delta_make_rename(tgt, src, best_match->similarity); |
| num_rewrites--; |
| |
| assert(src->status == GIT_DELTA_DELETED); |
| memcpy(&src->old_file, &swap, sizeof(src->old_file)); |
| memset(&src->new_file, 0, sizeof(src->new_file)); |
| src->new_file.path = src->old_file.path; |
| src->new_file.flags |= GIT_DIFF_FLAG_VALID_ID; |
| |
| num_updates++; |
| |
| if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) { |
| /* what used to be at src t is now at src s */ |
| tgt2src[src2tgt[t].idx].idx = s; |
| } |
| } |
| } |
| |
| else if (delta_is_split(src)) { |
| |
| if (delta_is_new_only(tgt)) { |
| |
| if (best_match->similarity < opts.rename_threshold) |
| continue; |
| |
| delta_make_rename(tgt, src, best_match->similarity); |
| |
| src->status = (diff->new_src == GIT_ITERATOR_TYPE_WORKDIR) ? |
| GIT_DELTA_UNTRACKED : GIT_DELTA_ADDED; |
| src->nfiles = 1; |
| memset(&src->old_file, 0, sizeof(src->old_file)); |
| src->old_file.path = src->new_file.path; |
| src->old_file.flags |= GIT_DIFF_FLAG_VALID_ID; |
| |
| src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; |
| num_rewrites--; |
| |
| num_updates++; |
| } else { |
| assert(delta_is_split(src)); |
| |
| if (best_match->similarity < opts.rename_from_rewrite_threshold) |
| continue; |
| |
| memcpy(&swap, &tgt->old_file, sizeof(swap)); |
| |
| delta_make_rename(tgt, src, best_match->similarity); |
| num_rewrites--; |
| num_updates++; |
| |
| memcpy(&src->old_file, &swap, sizeof(src->old_file)); |
| |
| /* if we've just swapped the new element into the correct |
| * place, clear the SPLIT flag |
| */ |
| if (tgt2src[s].idx == t && |
| tgt2src[s].similarity > |
| opts.rename_from_rewrite_threshold) { |
| src->status = GIT_DELTA_RENAMED; |
| src->similarity = tgt2src[s].similarity; |
| tgt2src[s].similarity = 0; |
| src->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; |
| num_rewrites--; |
| } |
| /* otherwise, if we just overwrote a source, update mapping */ |
| else if (src2tgt[t].similarity > 0 && src2tgt[t].idx > t) { |
| /* what used to be at src t is now at src s */ |
| tgt2src[src2tgt[t].idx].idx = s; |
| } |
| |
| num_updates++; |
| } |
| } |
| |
| else if (FLAG_SET(&opts, GIT_DIFF_FIND_COPIES)) { |
| if (tgt2src_copy[t].similarity < opts.copy_threshold) |
| continue; |
| |
| /* always use best possible source for copy */ |
| best_match = &tgt2src_copy[t]; |
| src = GIT_VECTOR_GET(&diff->deltas, best_match->idx); |
| |
| if (delta_is_split(tgt)) { |
| error = insert_delete_side_of_split(diff, &diff->deltas, tgt); |
| if (error < 0) |
| goto cleanup; |
| num_rewrites--; |
| } |
| |
| if (!delta_is_split(tgt) && !delta_is_new_only(tgt)) |
| continue; |
| |
| tgt->status = GIT_DELTA_COPIED; |
| tgt->similarity = best_match->similarity; |
| tgt->nfiles = 2; |
| memcpy(&tgt->old_file, &src->old_file, sizeof(tgt->old_file)); |
| tgt->flags &= ~GIT_DIFF_FLAG__TO_SPLIT; |
| |
| num_updates++; |
| } |
| } |
| |
| /* |
| * Actually split and delete entries as needed |
| */ |
| |
| if (num_rewrites > 0 || num_updates > 0) |
| error = apply_splits_and_deletes( |
| diff, diff->deltas.length - num_rewrites, |
| FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES) && |
| !FLAG_SET(&opts, GIT_DIFF_BREAK_REWRITES_FOR_RENAMES_ONLY)); |
| |
| cleanup: |
| git__free(tgt2src); |
| git__free(src2tgt); |
| git__free(tgt2src_copy); |
| |
| if (sigcache) { |
| for (t = 0; t < num_deltas * 2; ++t) { |
| if (sigcache[t] != NULL) |
| opts.metric->free_signature(sigcache[t], opts.metric->payload); |
| } |
| git__free(sigcache); |
| } |
| |
| if (!given_opts || !given_opts->metric) |
| git__free(opts.metric); |
| |
| return error; |
| } |
| |
| #undef FLAG_SET |