//! ### Deviation
//!
//! Note that the algorithm implemented here is in many ways different from what `git` does.
//!
//! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match.
//! - the set used for copy-detection is probably smaller by default.
use std::ops::Range;

use bstr::BStr;
use gix_object::tree::{EntryKind, EntryMode};

use crate::{
    blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind},
    rewrites::{CopySource, Outcome, Tracker},
    Rewrites,
};

/// The kind of a change.
#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
pub enum ChangeKind {
    /// The change represents the *deletion* of an item.
    Deletion,
    /// The change represents the *modification* of an item.
    Modification,
    /// The change represents the *addition* of an item.
    Addition,
}

/// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`].
pub trait Change: Clone {
    /// Return the hash of this change for identification.
    ///
    /// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace
    /// conversions.
    fn id(&self) -> &gix_hash::oid;
    /// Return the kind of this change.
    fn kind(&self) -> ChangeKind;
    /// Return more information about the kind of entry affected by this change.
    fn entry_mode(&self) -> EntryMode;
    /// Return the id of the change along with its mode.
    fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
}

/// A set of tracked items allows to figure out their relations by figuring out their similarity.
pub(crate) struct Item<T> {
    /// The underlying raw change
    change: T,
    /// That slice into the backing for paths.
    path: Range<usize>,
    /// If true, this item was already emitted, i.e. seen by the caller.
    emitted: bool,
}

impl<T: Change> Item<T> {
    fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
        backing[self.path.clone()].as_ref()
    }
    fn entry_mode_compatible(&self, mode: EntryMode) -> bool {
        use EntryKind::*;
        matches!(
            (mode.kind(), self.change.entry_mode().kind()),
            (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link)
        )
    }

    fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
        self.entry_mode_compatible(dest_item_mode)
            && match kind {
                visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
                visit::SourceKind::Copy => {
                    matches!(self.change.kind(), ChangeKind::Modification)
                }
            }
    }
}

/// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()).
pub mod visit {
    use bstr::BStr;
    use gix_object::tree::EntryMode;

    use crate::blob::DiffLineStats;

    /// The source of a rewrite, rename or copy.
    #[derive(Debug, Clone, PartialEq, PartialOrd)]
    pub struct Source<'a, T> {
        /// The kind of entry.
        pub entry_mode: EntryMode,
        /// The hash of the state of the source as seen in the object database.
        pub id: gix_hash::ObjectId,
        /// Further specify what kind of source this is.
        pub kind: SourceKind,
        /// The repository-relative location of this entry.
        pub location: &'a BStr,
        /// The change that was registered as source.
        pub change: &'a T,
        /// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination.
        pub diff: Option<DiffLineStats>,
    }

    /// Further identify the kind of [Source].
    #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
    pub enum SourceKind {
        /// This is the source of an entry that was renamed, as `source` was renamed to `destination`.
        Rename,
        /// This is the source of a copy, as `source` was copied into `destination`.
        Copy,
    }

    /// A change along with a location.
    #[derive(Clone)]
    pub struct Destination<'a, T: Clone> {
        /// The change at the given `location`.
        pub change: T,
        /// The repository-relative location of this destination.
        pub location: &'a BStr,
    }
}

///
#[allow(clippy::empty_docs)]
pub mod emit {
    /// The error returned by [Tracker::emit()](super::Tracker::emit()).
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error("Could not find blob for similarity checking")]
        FindExistingBlob(#[from] gix_object::find::existing_object::Error),
        #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
        GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
        #[error(transparent)]
        SetResource(#[from] crate::blob::platform::set_resource::Error),
        #[error(transparent)]
        PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
    }
}

/// Lifecycle
impl<T: Change> Tracker<T> {
    /// Create a new instance with `rewrites` configuration.
    pub fn new(rewrites: Rewrites) -> Self {
        Tracker {
            items: vec![],
            path_backing: vec![],
            rewrites,
        }
    }
}

/// build state and find matches.
impl<T: Change> Tracker<T> {
    /// We may refuse the push if that information isn't needed for what we have to track.
    pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
        if !change.entry_mode().is_blob_or_symlink() {
            return Some(change);
        }
        let keep = match (self.rewrites.copies, change.kind()) {
            (Some(_find_copies), _) => true,
            (None, ChangeKind::Modification { .. }) => false,
            (None, _) => true,
        };

        if !keep {
            return Some(change);
        }

        let start = self.path_backing.len();
        self.path_backing.extend_from_slice(location);
        self.items.push(Item {
            path: start..self.path_backing.len(),
            change,
            emitted: false,
        });
        None
    }

    /// Can only be called once effectively as it alters its own state to assure each item is only emitted once.
    ///
    /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's
    /// the destination of a copy or rename, or with `None` for source if no relation to other
    /// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened.
    ///
    /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database.
    /// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not*
    /// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going
    /// through conversion steps.
    ///
    /// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls
    /// the diff-algorithm (provided no user-algorithm is set).
    /// Note that we control a few options of `diff_cache` to assure it will ignore external commands.
    /// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide
    /// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions.
    ///
    /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source
    /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push`
    /// will panic if `change` is not a modification, and it's valid to not call `push` at all.
    pub fn emit<PushSourceTreeFn, E>(
        &mut self,
        mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> crate::tree::visit::Action,
        diff_cache: &mut crate::blob::Platform,
        objects: &impl gix_object::FindObjectOrHeader,
        mut push_source_tree: PushSourceTreeFn,
    ) -> Result<Outcome, emit::Error>
    where
        PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
        E: std::error::Error + Send + Sync + 'static,
    {
        diff_cache.options.skip_internal_diff_if_external_is_configured = false;

        fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
            a.change
                .id()
                .cmp(b.change.id())
                .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end)))
        }
        self.items.sort_by(by_id_and_location);

        let mut out = Outcome {
            options: self.rewrites,
            ..Default::default()
        };
        self.match_pairs_of_kind(
            visit::SourceKind::Rename,
            &mut cb,
            self.rewrites.percentage,
            &mut out,
            diff_cache,
            objects,
        )?;

        if let Some(copies) = self.rewrites.copies {
            self.match_pairs_of_kind(
                visit::SourceKind::Copy,
                &mut cb,
                copies.percentage,
                &mut out,
                diff_cache,
                objects,
            )?;

            match copies.source {
                CopySource::FromSetOfModifiedFiles => {}
                CopySource::FromSetOfModifiedFilesAndAllSources => {
                    push_source_tree(&mut |change, location| {
                        assert!(
                            self.try_push_change(change, location).is_none(),
                            "we must accept every change"
                        );
                        // make sure these aren't viable to be emitted anymore.
                        self.items.last_mut().expect("just pushed").emitted = true;
                    })
                    .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
                    self.items.sort_by(by_id_and_location);

                    self.match_pairs_of_kind(
                        visit::SourceKind::Copy,
                        &mut cb,
                        copies.percentage,
                        &mut out,
                        diff_cache,
                        objects,
                    )?;
                }
            }
        }

        self.items
            .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
        for item in self.items.drain(..).filter(|item| !item.emitted) {
            if cb(
                visit::Destination {
                    location: item.location(&self.path_backing),
                    change: item.change,
                },
                None,
            ) == crate::tree::visit::Action::Cancel
            {
                break;
            }
        }
        Ok(out)
    }
}

impl<T: Change> Tracker<T> {
    fn match_pairs_of_kind(
        &mut self,
        kind: visit::SourceKind,
        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> crate::tree::visit::Action,
        percentage: Option<f32>,
        out: &mut Outcome,
        diff_cache: &mut crate::blob::Platform,
        objects: &impl gix_object::FindObjectOrHeader,
    ) -> Result<(), emit::Error> {
        // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
        let needs_second_pass = !needs_exact_match(percentage);
        if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects)?
            == crate::tree::visit::Action::Cancel
        {
            return Ok(());
        }
        if needs_second_pass {
            let is_limited = if self.rewrites.limit == 0 {
                false
            } else {
                let (num_src, num_dst) =
                    estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind);
                let permutations = num_src * num_dst;
                if permutations > self.rewrites.limit {
                    match kind {
                        visit::SourceKind::Rename => {
                            out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
                        }
                        visit::SourceKind::Copy => {
                            out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
                        }
                    }
                    true
                } else {
                    false
                }
            };
            if !is_limited {
                self.match_pairs(cb, percentage, kind, out, diff_cache, objects)?;
            }
        }
        Ok(())
    }

    fn match_pairs(
        &mut self,
        cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_, T>>) -> crate::tree::visit::Action,
        percentage: Option<f32>,
        kind: visit::SourceKind,
        stats: &mut Outcome,
        diff_cache: &mut crate::blob::Platform,
        objects: &impl gix_object::FindObjectOrHeader,
    ) -> Result<crate::tree::visit::Action, emit::Error> {
        let mut dest_ofs = 0;
        while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
            (!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item))
        }) {
            dest_idx += dest_ofs;
            dest_ofs = dest_idx + 1;
            let src = find_match(
                &self.items,
                dest,
                dest_idx,
                percentage,
                kind,
                stats,
                objects,
                diff_cache,
                &self.path_backing,
            )?
            .map(|(src_idx, src, diff)| {
                let (id, entry_mode) = src.change.id_and_entry_mode();
                let id = id.to_owned();
                let location = src.location(&self.path_backing);
                (
                    visit::Source {
                        entry_mode,
                        id,
                        kind,
                        location,
                        change: &src.change,
                        diff,
                    },
                    src_idx,
                )
            });
            if src.is_none() {
                continue;
            }
            let location = dest.location(&self.path_backing);
            let change = dest.change.clone();
            let dest = visit::Destination { change, location };
            let src_idx = src.as_ref().map(|t| t.1);
            let res = cb(dest, src.map(|t| t.0));

            self.items[dest_idx].emitted = true;
            if let Some(src_idx) = src_idx {
                self.items[src_idx].emitted = true;
            }

            if res == crate::tree::visit::Action::Cancel {
                return Ok(crate::tree::visit::Action::Cancel);
            }
        }
        Ok(crate::tree::visit::Action::Continue)
    }
}

/// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation.
fn estimate_involved_items(
    items: impl IntoIterator<Item = (bool, ChangeKind)>,
    kind: visit::SourceKind,
) -> (usize, usize) {
    items
        .into_iter()
        .filter(|(emitted, _)| match kind {
            visit::SourceKind::Rename => !*emitted,
            visit::SourceKind::Copy => true,
        })
        .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| {
            match change_kind {
                ChangeKind::Addition => {
                    if kind == visit::SourceKind::Rename || !emitted {
                        dest += 1;
                    }
                }
                ChangeKind::Deletion => {
                    if kind == visit::SourceKind::Rename {
                        src += 1
                    }
                }
                ChangeKind::Modification => {
                    if kind == visit::SourceKind::Copy {
                        src += 1
                    }
                }
            }
            (src, dest)
        })
}

fn needs_exact_match(percentage: Option<f32>) -> bool {
    percentage.map_or(true, |p| p >= 1.0)
}

/// <`src_idx`, src, possibly diff stat>
type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);

/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`.
/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity.
/// We also ignore emitted items entirely.
/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or
/// any non-deletion otherwise.
/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set
/// of items to be searched.
#[allow(clippy::too_many_arguments)]
fn find_match<'a, T: Change>(
    items: &'a [Item<T>],
    item: &Item<T>,
    item_idx: usize,
    percentage: Option<f32>,
    kind: visit::SourceKind,
    stats: &mut Outcome,
    objects: &impl gix_object::FindObjectOrHeader,
    diff_cache: &mut crate::blob::Platform,
    path_backing: &[u8],
) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
    let (item_id, item_mode) = item.change.id_and_entry_mode();
    if needs_exact_match(percentage) || item_mode.is_link() {
        let first_idx = items.partition_point(|a| a.change.id() < item_id);
        let range = items.get(first_idx..).map(|items| {
            let end = items
                .iter()
                .position(|a| a.change.id() != item_id)
                .map_or(items.len(), |idx| first_idx + idx);
            first_idx..end
        });
        let range = match range {
            Some(range) => range,
            None => return Ok(None),
        };
        if range.is_empty() {
            return Ok(None);
        }
        let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
            src_idx += range.start;
            (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
        });
        if let Some(src) = res {
            return Ok(Some(src));
        }
    } else {
        let mut has_new = false;
        let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
        debug_assert!(
            item_mode.is_blob(),
            "symlinks are matched exactly, and trees aren't used here"
        );

        for (can_idx, src) in items
            .iter()
            .enumerate()
            .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
        {
            if !has_new {
                diff_cache.set_resource(
                    item_id.to_owned(),
                    item_mode.kind(),
                    item.location(path_backing),
                    ResourceKind::NewOrDestination,
                    objects,
                )?;
                has_new = true;
            }
            let (src_id, src_mode) = src.change.id_and_entry_mode();
            diff_cache.set_resource(
                src_id.to_owned(),
                src_mode.kind(),
                src.location(path_backing),
                ResourceKind::OldOrSource,
                objects,
            )?;
            let prep = diff_cache.prepare_diff()?;
            stats.num_similarity_checks += 1;
            match prep.operation {
                Operation::InternalDiff { algorithm } => {
                    let tokens =
                        crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
                    let counts = crate::blob::diff(
                        algorithm,
                        &tokens,
                        crate::blob::sink::Counter::new(diff::Statistics {
                            removed_bytes: 0,
                            input: &tokens,
                        }),
                    );
                    let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
                    let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
                    let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
                    if similarity >= percentage {
                        return Ok(Some((
                            can_idx,
                            src,
                            DiffLineStats {
                                removals: counts.removals,
                                insertions: counts.insertions,
                                before: tokens.before.len().try_into().expect("interner handles only u32"),
                                after: tokens.after.len().try_into().expect("interner handles only u32"),
                                similarity,
                            }
                            .into(),
                        )));
                    }
                }
                Operation::ExternalCommand { .. } => {
                    unreachable!("we have disabled this possibility with an option")
                }
                Operation::SourceOrDestinationIsBinary => {
                    // TODO: figure out if git does more here
                }
            };
        }
    }
    Ok(None)
}

mod diff {
    use std::ops::Range;

    pub struct Statistics<'a, 'data> {
        pub removed_bytes: usize,
        pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>,
    }

    impl<'a, 'data> crate::blob::Sink for Statistics<'a, 'data> {
        type Out = usize;

        fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
            self.removed_bytes = self.input.before[before.start as usize..before.end as usize]
                .iter()
                .map(|token| self.input.interner[*token].len())
                .sum();
        }

        fn finish(self) -> Self::Out {
            self.removed_bytes
        }
    }
}

#[cfg(test)]
mod estimate_involved_items {
    use super::estimate_involved_items;
    use crate::rewrites::tracker::{visit::SourceKind, ChangeKind};

    #[test]
    fn renames_count_unemitted_as_sources_and_destinations() {
        let items = [
            (false, ChangeKind::Addition),
            (true, ChangeKind::Deletion),
            (true, ChangeKind::Deletion),
        ];
        assert_eq!(
            estimate_involved_items(items, SourceKind::Rename),
            (0, 1),
            "here we only have one eligible source, hence nothing to do"
        );
        assert_eq!(
            estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename),
            (2, 1),
            "now we have more possibilities as renames count un-emitted deletions as source"
        );
    }

    #[test]
    fn copies_do_not_count_additions_as_sources() {
        let items = [
            (false, ChangeKind::Addition),
            (true, ChangeKind::Addition),
            (true, ChangeKind::Deletion),
        ];
        assert_eq!(
            estimate_involved_items(items, SourceKind::Copy),
            (0, 1),
            "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
            deletions don't count"
        );
    }

    #[test]
    fn copies_count_modifications_as_sources() {
        let items = [
            (false, ChangeKind::Addition),
            (true, ChangeKind::Modification),
            (false, ChangeKind::Modification),
        ];
        assert_eq!(
            estimate_involved_items(items, SourceKind::Copy),
            (2, 1),
            "any modifications is a valid source, emitted or not"
        );
    }
}
