Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 1 | // Copyright 2017 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "components/zucchini/equivalence_map.h" |
| 6 | |
| 7 | #include <algorithm> |
| 8 | #include <utility> |
| 9 | |
Lei Zhang | 7f0a390 | 2021-06-30 16:14:59 +0000 | [diff] [blame] | 10 | #include "base/containers/cxx20_erase.h" |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 11 | #include "base/logging.h" |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 12 | #include "base/numerics/safe_conversions.h" |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 13 | #include "components/zucchini/encoded_view.h" |
| 14 | #include "components/zucchini/patch_reader.h" |
| 15 | #include "components/zucchini/suffix_array.h" |
| 16 | |
| 17 | namespace zucchini { |
| 18 | |
Calder Kitagawa | 2ed3877 | 2018-06-28 15:32:16 +0000 | [diff] [blame] | 19 | namespace { |
| 20 | |
| 21 | // TODO(haungs): Tune these numbers to improve pathological case results. |
| 22 | |
| 23 | // In pathological cases Zucchini can exhibit O(n^2) behavior if the seed |
| 24 | // selection process runs to completion. To prevent this we impose a quota for |
| 25 | // the total length of equivalences the seed selection process can perform |
| 26 | // trials on. For regular use cases it is unlikely this quota will be exceeded, |
| 27 | // and if it is the effects on patch size are expected to be small. |
| 28 | constexpr uint64_t kSeedSelectionTotalVisitLengthQuota = 1 << 18; // 256 KiB |
| 29 | |
| 30 | // The aforementioned quota alone is insufficient, as exploring backwards will |
| 31 | // still be very successful resulting in O(n) behavior in the case of a limited |
| 32 | // seed selection trials. This results in O(n^2) behavior returning. To mitigate |
| 33 | // this we also impose a cap on the ExtendEquivalenceBackward() exploration. |
| 34 | constexpr offset_t kBackwardsExtendLimit = 1 << 16; // 64 KiB |
| 35 | |
| 36 | } // namespace |
| 37 | |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 38 | /******** Utility Functions ********/ |
| 39 | |
| 40 | double GetTokenSimilarity( |
| 41 | const ImageIndex& old_image_index, |
| 42 | const ImageIndex& new_image_index, |
| 43 | const std::vector<TargetsAffinity>& targets_affinities, |
| 44 | offset_t src, |
| 45 | offset_t dst) { |
| 46 | DCHECK(old_image_index.IsToken(src)); |
| 47 | DCHECK(new_image_index.IsToken(dst)); |
| 48 | |
| 49 | TypeTag old_type = old_image_index.LookupType(src); |
| 50 | TypeTag new_type = new_image_index.LookupType(dst); |
| 51 | if (old_type != new_type) |
| 52 | return kMismatchFatal; |
| 53 | |
| 54 | // Raw comparison. |
| 55 | if (!old_image_index.IsReference(src) && !new_image_index.IsReference(dst)) { |
| 56 | return old_image_index.GetRawValue(src) == new_image_index.GetRawValue(dst) |
| 57 | ? 1.0 |
| 58 | : -1.5; |
| 59 | } |
| 60 | |
| 61 | const ReferenceSet& old_ref_set = old_image_index.refs(old_type); |
| 62 | const ReferenceSet& new_ref_set = new_image_index.refs(new_type); |
Etienne Pierre-doray | 8f9a9e7 | 2018-08-13 18:49:00 +0000 | [diff] [blame] | 63 | Reference old_reference = old_ref_set.at(src); |
| 64 | Reference new_reference = new_ref_set.at(dst); |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 65 | PoolTag pool_tag = old_ref_set.pool_tag(); |
| 66 | |
| 67 | double affinity = targets_affinities[pool_tag.value()].AffinityBetween( |
Etienne Pierre-doray | 8f9a9e7 | 2018-08-13 18:49:00 +0000 | [diff] [blame] | 68 | old_ref_set.target_pool().KeyForOffset(old_reference.target), |
| 69 | new_ref_set.target_pool().KeyForOffset(new_reference.target)); |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 70 | |
| 71 | // Both targets are not associated, which implies a weak match. |
| 72 | if (affinity == 0.0) |
| 73 | return 0.5 * old_ref_set.width(); |
| 74 | |
| 75 | // At least one target is associated, so values are compared. |
| 76 | return affinity > 0.0 ? old_ref_set.width() : -2.0; |
| 77 | } |
| 78 | |
| 79 | double GetEquivalenceSimilarity( |
| 80 | const ImageIndex& old_image_index, |
| 81 | const ImageIndex& new_image_index, |
| 82 | const std::vector<TargetsAffinity>& targets_affinities, |
| 83 | const Equivalence& equivalence) { |
| 84 | double similarity = 0.0; |
| 85 | for (offset_t k = 0; k < equivalence.length; ++k) { |
| 86 | // Non-tokens are joined with the nearest previous token: skip until we |
| 87 | // cover the unit. |
| 88 | if (!new_image_index.IsToken(equivalence.dst_offset + k)) |
| 89 | continue; |
| 90 | |
| 91 | similarity += GetTokenSimilarity( |
| 92 | old_image_index, new_image_index, targets_affinities, |
| 93 | equivalence.src_offset + k, equivalence.dst_offset + k); |
| 94 | if (similarity == kMismatchFatal) |
| 95 | return kMismatchFatal; |
| 96 | } |
| 97 | return similarity; |
| 98 | } |
| 99 | |
| 100 | EquivalenceCandidate ExtendEquivalenceForward( |
| 101 | const ImageIndex& old_image_index, |
| 102 | const ImageIndex& new_image_index, |
| 103 | const std::vector<TargetsAffinity>& targets_affinities, |
| 104 | const EquivalenceCandidate& candidate, |
| 105 | double min_similarity) { |
| 106 | Equivalence equivalence = candidate.eq; |
| 107 | offset_t best_k = equivalence.length; |
| 108 | double current_similarity = candidate.similarity; |
| 109 | double best_similarity = current_similarity; |
| 110 | double current_penalty = min_similarity; |
| 111 | for (offset_t k = best_k; |
| 112 | equivalence.src_offset + k < old_image_index.size() && |
| 113 | equivalence.dst_offset + k < new_image_index.size(); |
| 114 | ++k) { |
| 115 | // Mismatch in type, |candidate| cannot be extended further. |
| 116 | if (old_image_index.LookupType(equivalence.src_offset + k) != |
| 117 | new_image_index.LookupType(equivalence.dst_offset + k)) { |
| 118 | break; |
| 119 | } |
| 120 | |
| 121 | if (!new_image_index.IsToken(equivalence.dst_offset + k)) { |
| 122 | // Non-tokens are joined with the nearest previous token: skip until we |
| 123 | // cover the unit, and extend |best_k| if applicable. |
| 124 | if (best_k == k) |
| 125 | best_k = k + 1; |
| 126 | continue; |
| 127 | } |
| 128 | |
| 129 | double similarity = GetTokenSimilarity( |
| 130 | old_image_index, new_image_index, targets_affinities, |
| 131 | equivalence.src_offset + k, equivalence.dst_offset + k); |
| 132 | current_similarity += similarity; |
| 133 | current_penalty = std::max(0.0, current_penalty) - similarity; |
| 134 | |
| 135 | if (current_similarity < 0.0 || current_penalty >= min_similarity) |
| 136 | break; |
| 137 | if (current_similarity >= best_similarity) { |
| 138 | best_similarity = current_similarity; |
| 139 | best_k = k + 1; |
| 140 | } |
| 141 | } |
| 142 | equivalence.length = best_k; |
| 143 | return {equivalence, best_similarity}; |
| 144 | } |
| 145 | |
| 146 | EquivalenceCandidate ExtendEquivalenceBackward( |
| 147 | const ImageIndex& old_image_index, |
| 148 | const ImageIndex& new_image_index, |
| 149 | const std::vector<TargetsAffinity>& targets_affinities, |
| 150 | const EquivalenceCandidate& candidate, |
| 151 | double min_similarity) { |
| 152 | Equivalence equivalence = candidate.eq; |
| 153 | offset_t best_k = 0; |
| 154 | double current_similarity = candidate.similarity; |
| 155 | double best_similarity = current_similarity; |
| 156 | double current_penalty = 0.0; |
Calder Kitagawa | 2ed3877 | 2018-06-28 15:32:16 +0000 | [diff] [blame] | 157 | offset_t k_min = std::min( |
| 158 | {equivalence.dst_offset, equivalence.src_offset, kBackwardsExtendLimit}); |
| 159 | for (offset_t k = 1; k <= k_min; ++k) { |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 160 | // Mismatch in type, |candidate| cannot be extended further. |
| 161 | if (old_image_index.LookupType(equivalence.src_offset - k) != |
| 162 | new_image_index.LookupType(equivalence.dst_offset - k)) { |
| 163 | break; |
| 164 | } |
| 165 | |
| 166 | // Non-tokens are joined with the nearest previous token: skip until we |
| 167 | // reach the next token. |
| 168 | if (!new_image_index.IsToken(equivalence.dst_offset - k)) |
| 169 | continue; |
| 170 | |
| 171 | DCHECK_EQ(old_image_index.LookupType(equivalence.src_offset - k), |
| 172 | new_image_index.LookupType(equivalence.dst_offset - |
| 173 | k)); // Sanity check. |
| 174 | double similarity = GetTokenSimilarity( |
| 175 | old_image_index, new_image_index, targets_affinities, |
| 176 | equivalence.src_offset - k, equivalence.dst_offset - k); |
| 177 | |
| 178 | current_similarity += similarity; |
| 179 | current_penalty = std::max(0.0, current_penalty) - similarity; |
| 180 | |
| 181 | if (current_similarity < 0.0 || current_penalty >= min_similarity) |
| 182 | break; |
| 183 | if (current_similarity >= best_similarity) { |
| 184 | best_similarity = current_similarity; |
| 185 | best_k = k; |
| 186 | } |
| 187 | } |
| 188 | |
| 189 | equivalence.dst_offset -= best_k; |
| 190 | equivalence.src_offset -= best_k; |
| 191 | equivalence.length += best_k; |
| 192 | return {equivalence, best_similarity}; |
| 193 | } |
| 194 | |
| 195 | EquivalenceCandidate VisitEquivalenceSeed( |
| 196 | const ImageIndex& old_image_index, |
| 197 | const ImageIndex& new_image_index, |
| 198 | const std::vector<TargetsAffinity>& targets_affinities, |
| 199 | offset_t src, |
| 200 | offset_t dst, |
| 201 | double min_similarity) { |
| 202 | EquivalenceCandidate candidate{{src, dst, 0}, 0.0}; // Empty. |
| 203 | if (!old_image_index.IsToken(src)) |
| 204 | return candidate; |
| 205 | candidate = |
| 206 | ExtendEquivalenceForward(old_image_index, new_image_index, |
| 207 | targets_affinities, candidate, min_similarity); |
| 208 | if (candidate.similarity < min_similarity) |
| 209 | return candidate; // Not worth exploring any more. |
| 210 | return ExtendEquivalenceBackward(old_image_index, new_image_index, |
| 211 | targets_affinities, candidate, |
| 212 | min_similarity); |
| 213 | } |
| 214 | |
| 215 | /******** OffsetMapper ********/ |
| 216 | |
Etienne Pierre-doray | 8bb965d | 2021-10-29 14:12:23 +0000 | [diff] [blame] | 217 | OffsetMapper::OffsetMapper(std::deque<Equivalence>&& equivalences, |
Etienne Pierre-doray | 725a873 | 2018-09-10 16:19:33 +0000 | [diff] [blame] | 218 | offset_t old_image_size, |
| 219 | offset_t new_image_size) |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 220 | : equivalences_(std::move(equivalences)), |
| 221 | old_image_size_(old_image_size), |
| 222 | new_image_size_(new_image_size) { |
| 223 | DCHECK_GT(new_image_size_, 0U); |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 224 | DCHECK(std::is_sorted(equivalences_.begin(), equivalences_.end(), |
| 225 | [](const Equivalence& a, const Equivalence& b) { |
| 226 | return a.src_offset < b.src_offset; |
| 227 | })); |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 228 | // This is for testing. Assume pruned. |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 229 | } |
| 230 | |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 231 | OffsetMapper::OffsetMapper(EquivalenceSource&& equivalence_source, |
Etienne Pierre-doray | 725a873 | 2018-09-10 16:19:33 +0000 | [diff] [blame] | 232 | offset_t old_image_size, |
| 233 | offset_t new_image_size) |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 234 | : old_image_size_(old_image_size), new_image_size_(new_image_size) { |
| 235 | DCHECK_GT(new_image_size_, 0U); |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 236 | for (auto e = equivalence_source.GetNext(); e.has_value(); |
| 237 | e = equivalence_source.GetNext()) { |
| 238 | equivalences_.push_back(*e); |
| 239 | } |
| 240 | PruneEquivalencesAndSortBySource(&equivalences_); |
| 241 | } |
| 242 | |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 243 | OffsetMapper::OffsetMapper(const EquivalenceMap& equivalence_map, |
Etienne Pierre-doray | 725a873 | 2018-09-10 16:19:33 +0000 | [diff] [blame] | 244 | offset_t old_image_size, |
| 245 | offset_t new_image_size) |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 246 | : equivalences_(equivalence_map.size()), |
| 247 | old_image_size_(old_image_size), |
| 248 | new_image_size_(new_image_size) { |
| 249 | DCHECK_GT(new_image_size_, 0U); |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 250 | std::transform(equivalence_map.begin(), equivalence_map.end(), |
| 251 | equivalences_.begin(), |
| 252 | [](const EquivalenceCandidate& c) { return c.eq; }); |
| 253 | PruneEquivalencesAndSortBySource(&equivalences_); |
| 254 | } |
| 255 | |
| 256 | OffsetMapper::~OffsetMapper() = default; |
| 257 | |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 258 | // Safely evaluates |offset - unit.src_offset + unit.dst_offset| with signed |
| 259 | // arithmetic, then clips the result to |[0, new_image_size_)|. |
| 260 | offset_t OffsetMapper::NaiveExtendedForwardProject(const Equivalence& unit, |
| 261 | offset_t offset) const { |
| 262 | int64_t old_offset64 = offset; |
| 263 | int64_t src_offset64 = unit.src_offset; |
| 264 | int64_t dst_offset64 = unit.dst_offset; |
| 265 | uint64_t new_offset64 = std::min<uint64_t>( |
| 266 | std::max<int64_t>(0LL, old_offset64 - src_offset64 + dst_offset64), |
| 267 | new_image_size_ - 1); |
| 268 | return base::checked_cast<offset_t>(new_offset64); |
| 269 | } |
| 270 | |
| 271 | offset_t OffsetMapper::ExtendedForwardProject(offset_t offset) const { |
| 272 | DCHECK(!equivalences_.empty()); |
| 273 | if (offset < old_image_size_) { |
| 274 | // Finds the equivalence unit whose "old" block is nearest to |offset|, |
| 275 | // favoring the block with lower offset in case of a tie. |
| 276 | auto pos = std::upper_bound( |
| 277 | equivalences_.begin(), equivalences_.end(), offset, |
| 278 | [](offset_t a, const Equivalence& b) { return a < b.src_offset; }); |
| 279 | // For tiebreaking: |offset - pos[-1].src_end()| is actually 1 less than |
| 280 | // |offset|'s distance to "old" block of |pos[-1]|. Therefore "<" is used. |
| 281 | if (pos != equivalences_.begin() && |
| 282 | (pos == equivalences_.end() || offset < pos[-1].src_end() || |
| 283 | offset - pos[-1].src_end() < pos->src_offset - offset)) { |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 284 | --pos; |
| 285 | } |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 286 | return NaiveExtendedForwardProject(*pos, offset); |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 287 | } |
Samuel Huang | 8fdb8ba | 2018-06-26 14:47:02 +0000 | [diff] [blame] | 288 | // Fake offsets. |
| 289 | offset_t delta = offset - old_image_size_; |
| 290 | return delta < kOffsetBound - new_image_size_ ? new_image_size_ + delta |
| 291 | : kOffsetBound - 1; |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 292 | } |
| 293 | |
Etienne Pierre-doray | aff4086 | 2021-09-14 17:31:51 +0000 | [diff] [blame] | 294 | void OffsetMapper::ForwardProjectAll(std::deque<offset_t>* offsets) const { |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 295 | DCHECK(std::is_sorted(offsets->begin(), offsets->end())); |
| 296 | auto current = equivalences_.begin(); |
| 297 | for (auto& src : *offsets) { |
| 298 | while (current != end() && current->src_end() <= src) { |
| 299 | ++current; |
| 300 | } |
| 301 | |
| 302 | if (current != end() && current->src_offset <= src) { |
| 303 | src = src - current->src_offset + current->dst_offset; |
| 304 | } else { |
| 305 | src = kInvalidOffset; |
| 306 | } |
| 307 | } |
Jdragon | 74d44ed | 2018-08-24 12:46:42 +0000 | [diff] [blame] | 308 | base::Erase(*offsets, kInvalidOffset); |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 309 | offsets->shrink_to_fit(); |
| 310 | } |
| 311 | |
| 312 | void OffsetMapper::PruneEquivalencesAndSortBySource( |
Etienne Pierre-doray | 8bb965d | 2021-10-29 14:12:23 +0000 | [diff] [blame] | 313 | std::deque<Equivalence>* equivalences) { |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 314 | std::sort(equivalences->begin(), equivalences->end(), |
| 315 | [](const Equivalence& a, const Equivalence& b) { |
| 316 | return a.src_offset < b.src_offset; |
| 317 | }); |
| 318 | |
| 319 | for (auto current = equivalences->begin(); current != equivalences->end(); |
| 320 | ++current) { |
| 321 | // A "reaper" is an equivalence after |current| that overlaps with it, but |
| 322 | // is longer, and so truncates |current|. For example: |
| 323 | // ****** <= |current| |
| 324 | // ** |
| 325 | // **** |
| 326 | // **** |
| 327 | // ********** <= |next| as reaper. |
| 328 | // If a reaper is found (as |next|), every equivalence strictly between |
| 329 | // |current| and |next| would be truncated to 0 and discarded. Handling this |
| 330 | // case is important to avoid O(n^2) behavior. |
| 331 | bool next_is_reaper = false; |
| 332 | |
| 333 | // Look ahead to resolve overlaps, until a better candidate is found. |
| 334 | auto next = current + 1; |
| 335 | for (; next != equivalences->end(); ++next) { |
| 336 | DCHECK_GE(next->src_offset, current->src_offset); |
| 337 | if (next->src_offset >= current->src_end()) |
| 338 | break; // No more overlap. |
| 339 | |
| 340 | if (current->length < next->length) { |
| 341 | // |next| is better: So it is a reaper that shrinks |current|. |
| 342 | offset_t delta = current->src_end() - next->src_offset; |
| 343 | current->length -= delta; |
| 344 | next_is_reaper = true; |
| 345 | break; |
| 346 | } |
| 347 | } |
| 348 | |
| 349 | if (next_is_reaper) { |
| 350 | // Discard all equivalences strictly between |cur| and |next|. |
| 351 | for (auto reduced = current + 1; reduced != next; ++reduced) |
| 352 | reduced->length = 0; |
| 353 | current = next - 1; |
| 354 | } else { |
| 355 | // Shrink all equivalences that overlap with |current|. These are all |
| 356 | // worse than |current| since no reaper is found. |
| 357 | for (auto reduced = current + 1; reduced != next; ++reduced) { |
| 358 | offset_t delta = current->src_end() - reduced->src_offset; |
| 359 | reduced->length -= std::min(reduced->length, delta); |
| 360 | reduced->src_offset += delta; |
| 361 | reduced->dst_offset += delta; |
| 362 | DCHECK_EQ(reduced->src_offset, current->src_end()); |
| 363 | } |
| 364 | } |
| 365 | } |
| 366 | |
| 367 | // Discard all equivalences with length == 0. |
Jdragon | 74d44ed | 2018-08-24 12:46:42 +0000 | [diff] [blame] | 368 | base::EraseIf(*equivalences, [](const Equivalence& equivalence) { |
| 369 | return equivalence.length == 0; |
| 370 | }); |
Etienne Pierre-doray | 8bb965d | 2021-10-29 14:12:23 +0000 | [diff] [blame] | 371 | equivalences->shrink_to_fit(); |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 372 | } |
| 373 | |
| 374 | /******** EquivalenceMap ********/ |
| 375 | |
| 376 | EquivalenceMap::EquivalenceMap() = default; |
| 377 | |
| 378 | EquivalenceMap::EquivalenceMap(std::vector<EquivalenceCandidate>&& equivalences) |
| 379 | : candidates_(std::move(equivalences)) { |
| 380 | SortByDestination(); |
| 381 | } |
| 382 | |
| 383 | EquivalenceMap::EquivalenceMap(EquivalenceMap&&) = default; |
| 384 | |
| 385 | EquivalenceMap::~EquivalenceMap() = default; |
| 386 | |
| 387 | void EquivalenceMap::Build( |
| 388 | const std::vector<offset_t>& old_sa, |
| 389 | const EncodedView& old_view, |
| 390 | const EncodedView& new_view, |
| 391 | const std::vector<TargetsAffinity>& targets_affinities, |
| 392 | double min_similarity) { |
| 393 | DCHECK_EQ(old_sa.size(), old_view.size()); |
| 394 | |
| 395 | CreateCandidates(old_sa, old_view, new_view, targets_affinities, |
| 396 | min_similarity); |
| 397 | SortByDestination(); |
| 398 | Prune(old_view, new_view, targets_affinities, min_similarity); |
| 399 | |
| 400 | offset_t coverage = 0; |
| 401 | offset_t current_offset = 0; |
| 402 | for (auto candidate : candidates_) { |
| 403 | DCHECK_GE(candidate.eq.dst_offset, current_offset); |
| 404 | coverage += candidate.eq.length; |
| 405 | current_offset = candidate.eq.dst_end(); |
| 406 | } |
| 407 | LOG(INFO) << "Equivalence Count: " << size(); |
| 408 | LOG(INFO) << "Coverage / Extra / Total: " << coverage << " / " |
| 409 | << new_view.size() - coverage << " / " << new_view.size(); |
| 410 | } |
| 411 | |
| 412 | void EquivalenceMap::CreateCandidates( |
| 413 | const std::vector<offset_t>& old_sa, |
| 414 | const EncodedView& old_view, |
| 415 | const EncodedView& new_view, |
| 416 | const std::vector<TargetsAffinity>& targets_affinities, |
| 417 | double min_similarity) { |
| 418 | candidates_.clear(); |
| 419 | |
| 420 | // This is an heuristic to find 'good' equivalences on encoded views. |
| 421 | // Equivalences are found in ascending order of |new_image|. |
| 422 | offset_t dst_offset = 0; |
| 423 | |
| 424 | while (dst_offset < new_view.size()) { |
| 425 | if (!new_view.IsToken(dst_offset)) { |
| 426 | ++dst_offset; |
| 427 | continue; |
| 428 | } |
| 429 | auto match = |
| 430 | SuffixLowerBound(old_sa, old_view.begin(), |
| 431 | new_view.begin() + dst_offset, new_view.end()); |
| 432 | |
| 433 | offset_t next_dst_offset = dst_offset + 1; |
| 434 | // TODO(huangs): Clean up. |
| 435 | double best_similarity = min_similarity; |
Calder Kitagawa | 2ed3877 | 2018-06-28 15:32:16 +0000 | [diff] [blame] | 436 | uint64_t total_visit_length = 0; |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 437 | EquivalenceCandidate best_candidate = {{0, 0, 0}, 0.0}; |
| 438 | for (auto it = match; it != old_sa.end(); ++it) { |
| 439 | EquivalenceCandidate candidate = VisitEquivalenceSeed( |
| 440 | old_view.image_index(), new_view.image_index(), targets_affinities, |
| 441 | static_cast<offset_t>(*it), dst_offset, min_similarity); |
| 442 | if (candidate.similarity > best_similarity) { |
| 443 | best_candidate = candidate; |
| 444 | best_similarity = candidate.similarity; |
| 445 | next_dst_offset = candidate.eq.dst_end(); |
Calder Kitagawa | 2ed3877 | 2018-06-28 15:32:16 +0000 | [diff] [blame] | 446 | total_visit_length += candidate.eq.length; |
| 447 | if (total_visit_length > kSeedSelectionTotalVisitLengthQuota) { |
| 448 | break; |
| 449 | } |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 450 | } else { |
| 451 | break; |
| 452 | } |
| 453 | } |
Calder Kitagawa | 2ed3877 | 2018-06-28 15:32:16 +0000 | [diff] [blame] | 454 | total_visit_length = 0; |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 455 | for (auto it = match; it != old_sa.begin(); --it) { |
| 456 | EquivalenceCandidate candidate = VisitEquivalenceSeed( |
| 457 | old_view.image_index(), new_view.image_index(), targets_affinities, |
| 458 | static_cast<offset_t>(it[-1]), dst_offset, min_similarity); |
| 459 | if (candidate.similarity > best_similarity) { |
| 460 | best_candidate = candidate; |
| 461 | best_similarity = candidate.similarity; |
| 462 | next_dst_offset = candidate.eq.dst_end(); |
Calder Kitagawa | 2ed3877 | 2018-06-28 15:32:16 +0000 | [diff] [blame] | 463 | total_visit_length += candidate.eq.length; |
| 464 | if (total_visit_length > kSeedSelectionTotalVisitLengthQuota) { |
| 465 | break; |
| 466 | } |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 467 | } else { |
| 468 | break; |
| 469 | } |
| 470 | } |
| 471 | if (best_candidate.similarity >= min_similarity) { |
| 472 | candidates_.push_back(best_candidate); |
| 473 | } |
| 474 | |
| 475 | dst_offset = next_dst_offset; |
| 476 | } |
| 477 | } |
| 478 | |
| 479 | void EquivalenceMap::SortByDestination() { |
| 480 | std::sort(candidates_.begin(), candidates_.end(), |
| 481 | [](const EquivalenceCandidate& a, const EquivalenceCandidate& b) { |
| 482 | return a.eq.dst_offset < b.eq.dst_offset; |
| 483 | }); |
| 484 | } |
| 485 | |
| 486 | void EquivalenceMap::Prune( |
| 487 | const EncodedView& old_view, |
| 488 | const EncodedView& new_view, |
| 489 | const std::vector<TargetsAffinity>& target_affinities, |
| 490 | double min_similarity) { |
| 491 | // TODO(etiennep): unify with |
| 492 | // OffsetMapper::PruneEquivalencesAndSortBySource(). |
| 493 | for (auto current = candidates_.begin(); current != candidates_.end(); |
| 494 | ++current) { |
| 495 | if (current->similarity < min_similarity) |
| 496 | continue; // This candidate will be discarded anyways. |
| 497 | |
| 498 | bool next_is_reaper = false; |
| 499 | |
| 500 | // Look ahead to resolve overlaps, until a better candidate is found. |
| 501 | auto next = current + 1; |
| 502 | for (; next != candidates_.end(); ++next) { |
| 503 | DCHECK_GE(next->eq.dst_offset, current->eq.dst_offset); |
| 504 | if (next->eq.dst_offset >= current->eq.dst_offset + current->eq.length) |
| 505 | break; // No more overlap. |
| 506 | |
| 507 | if (current->similarity < next->similarity) { |
| 508 | // |next| is better: So it is a reaper that shrinks |current|. |
| 509 | offset_t delta = current->eq.dst_end() - next->eq.dst_offset; |
| 510 | current->eq.length -= delta; |
| 511 | current->similarity = GetEquivalenceSimilarity( |
| 512 | old_view.image_index(), new_view.image_index(), target_affinities, |
| 513 | current->eq); |
| 514 | |
| 515 | next_is_reaper = true; |
| 516 | break; |
| 517 | } |
| 518 | } |
| 519 | |
| 520 | if (next_is_reaper) { |
| 521 | // Discard all equivalences strictly between |cur| and |next|. |
| 522 | for (auto reduced = current + 1; reduced != next; ++reduced) { |
| 523 | reduced->eq.length = 0; |
| 524 | reduced->similarity = 0; |
| 525 | } |
| 526 | current = next - 1; |
| 527 | } else { |
| 528 | // Shrinks all overlapping candidates following and worse than |current|. |
| 529 | for (auto reduced = current + 1; reduced != next; ++reduced) { |
| 530 | offset_t delta = current->eq.dst_end() - reduced->eq.dst_offset; |
| 531 | reduced->eq.length -= std::min(reduced->eq.length, delta); |
| 532 | reduced->eq.src_offset += delta; |
| 533 | reduced->eq.dst_offset += delta; |
| 534 | reduced->similarity = GetEquivalenceSimilarity( |
| 535 | old_view.image_index(), new_view.image_index(), target_affinities, |
| 536 | reduced->eq); |
| 537 | DCHECK_EQ(reduced->eq.dst_offset, current->eq.dst_end()); |
| 538 | } |
| 539 | } |
| 540 | } |
| 541 | |
| 542 | // Discard all candidates with similarity smaller than |min_similarity|. |
Jdragon | 74d44ed | 2018-08-24 12:46:42 +0000 | [diff] [blame] | 543 | base::EraseIf(candidates_, |
| 544 | [min_similarity](const EquivalenceCandidate& candidate) { |
| 545 | return candidate.similarity < min_similarity; |
| 546 | }); |
Samuel Huang | 06f1ae9 | 2018-03-13 18:19:34 +0000 | [diff] [blame] | 547 | } |
| 548 | |
| 549 | } // namespace zucchini |