| #include "caffe2/core/memonger.h" |
| |
| #include <set> |
| #include <unordered_set> |
| |
| #include "caffe2/utils/proto_utils.h" |
| |
| namespace caffe2 { |
| |
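// Verify every operator in `net` against its registered schema. Operators
// whose type has no registered schema are skipped.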
| void run_schema_check(const NetDef& net) { |
| for (auto& op : net.op()) { |
| auto* schema = OpSchemaRegistry::Schema(op.type()); |
| if (schema) { |
| CAFFE_ENFORCE( |
| schema->Verify(op), |
| "Operator def did not pass schema checking: ", |
| ProtoDebugString(op)); |
| } |
| } |
| } |
| |
| namespace memonger { |
| |
| NetDef optimize_inference_net( |
| const NetDef& net, |
| const std::set<string>& static_blobs) { |
| if (net.type() != "" && net.type() != "simple") { |
| LOG(INFO) << "Cannot optimize memory for nets of type: " << net.type(); |
| return net; |
| } |
| |
  // Memonger modifies the graph, so run an early schema check here to make
  // sure the operators are valid.
| run_schema_check(net); |
| |
| std::vector<OperatorDef> ops; |
| for (auto& op : net.op()) { |
| if (op.type() == "RecurrentNetwork") { |
      // NOTE: for the subtleties of RNN op memonger, see memonger.py for how
      // to handle the forward/backward links etc.
| LOG(INFO) << "Memonger does not support RecurrentNetwork yet"; |
| return net; |
| } |
| ops.push_back(op); |
| } |
| |
  // Step 1: compute the first and last op index at which each blob is used.
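  // ranges[blob] = (index of the op that first outputs the blob,
  //                 index of the op that last uses it).
  // Static blobs never enter `ranges`, so they are never recycled.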
| std::unordered_set<std::string> all_blobs; |
| std::unordered_map<std::string, std::pair<int, int>> ranges; |
| for (size_t i = 0; i < ops.size(); i++) { |
| for (auto& inp : ops[i].input()) { |
| if (ranges.find(inp) != ranges.end()) { |
| ranges[inp].second = i; |
| } |
| all_blobs.insert(inp); |
| } |
| for (auto& outp : ops[i].output()) { |
| all_blobs.insert(outp); |
| if (static_blobs.find(outp) != static_blobs.end()) { |
| continue; |
| } |
| if (ranges.find(outp) == ranges.end()) { |
| ranges[outp] = std::make_pair(i, i); |
| } |
| } |
| } |
| |
  // Step 2: pass over the ops in order and recycle blobs whose live range
  // has ended.
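  // free_blobs: pool of blobs whose last consumer has already run.
  // mapping: original blob -> canonical blob whose storage it reuses.
  // renaming: canonical blob -> its final __mN_shared name.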
| std::vector<std::string> free_blobs; |
| std::unordered_map<std::string, std::string> renaming; |
| std::unordered_map<std::string, std::string> mapping; |
| |
| for (int i = 0; i < (int)ops.size(); i++) { |
| auto& op = ops[i]; |
| std::unordered_set<std::string> new_free_blobs; |
| |
    // Check whether this op is the last use of any input blob and, if so,
    // release that input into the free pool.
| for (auto& inp : op.input()) { |
| auto rit = ranges.find(inp); |
| if (rit != ranges.end() && rit->second.second == i) { |
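        // First time this blob is freed: map it to itself so it becomes the
        // canonical owner of its storage, and reserve a __mN_shared alias.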
| if (mapping.find(inp) == mapping.end()) { |
| new_free_blobs.insert(inp); |
| mapping[inp] = inp; |
| |
| // Safety check to prevent double-memongering nets. |
| string shared_blob = |
| "__m" + c10::to_string(renaming.size()) + "_shared"; |
| if (all_blobs.find(shared_blob) != all_blobs.end()) { |
| LOG(INFO) << "Net was already memongered!"; |
| return net; |
| } |
| renaming[inp] = shared_blob; |
| } else { |
| new_free_blobs.insert(mapping[inp]); |
| } |
| } |
| } |
| |
    // If an output's live range starts at this op, try to replace the output
    // with a recycled blob from the free pool.
| for (auto& outp : op.output()) { |
| if (!free_blobs.empty()) { |
| // first use? |
| auto rit = ranges.find(outp); |
| if (rit != ranges.end() && rit->second.first == i) { |
| std::string recycled = free_blobs.back(); |
| free_blobs.pop_back(); |
| mapping[outp] = recycled; |
| } |
| } |
| } |
| |
| // Add blobs released from this op to the pool. |
| for (auto& b : new_free_blobs) { |
| free_blobs.push_back(b); |
| } |
| } |
| |
  // Step 3: rename inputs and outputs and create the new net.
| NetDef optim_net = net; |
| optim_net.mutable_op()->Clear(); |
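  // Note: each op is taken by value below so its inputs/outputs can be
  // rewritten without touching the original list.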
| for (auto op : ops) { |
| for (int i = 0; i < op.input_size(); i++) { |
| auto& inp = op.input(i); |
| if (mapping.find(inp) != mapping.end()) { |
| op.set_input(i, renaming[mapping[inp]]); |
| } |
| } |
| for (int i = 0; i < op.output_size(); i++) { |
| auto& outp = op.output(i); |
| if (mapping.find(outp) != mapping.end()) { |
| op.set_output(i, renaming[mapping[outp]]); |
| } |
| } |
| auto* ao = optim_net.add_op(); |
| ao->CopyFrom(op); |
| } |
| |
| VLOG(1) << "optimized net using " << renaming.size() << " shared blobs"; |
| return optim_net; |
| } |
| |
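// Memonger for DAG-executed nets (e.g. training nets with parallel
// branches). Performs a DFS over the operator graph and lets an op reuse a
// blob's storage only when it provably runs after all of the blob's
// consumers; ordering is tracked with "tokens" that are minted at DFS roots
// and at branch points and must all be held before a blob can be recycled.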
| class ComputeBlobRecyclingForDag { |
| public: |
| explicit ComputeBlobRecyclingForDag(const int size) |
| : op_inputs_(size), |
| op_visited_count_(size), |
| op_token_deposit_(size), |
| op_visited_(size, false) {} |
| NetDef OptimizeNet( |
| const NetDef& net, |
| const std::vector<string>& heads, |
| const std::vector<int>& op_indices, |
| const std::unordered_set<string>& shareable_blob_names, |
| const string& namescope, |
| const std::unordered_set<string>& dont_share_blob_names, |
      const std::unordered_map<string, vector<int>>& blob_shapes) {
    // Memonger modifies the graph, so run an early schema check here to make
    // sure the operators are valid.
| run_schema_check(net); |
| // Construct the set of input blobs. |
| std::unordered_set<string> heads_blobs_set(heads.begin(), heads.end()); |
| |
    // Construct the set of output blobs we want to optimize; blobs that are
    // not eligible for sharing are filtered out.
| for (const int op_index : op_indices) { |
| for (const auto& output : net.op(op_index).output()) { |
        if (has_key(shareable_blob_names, output) &&
            !has_key(dont_share_blob_names, output)) {
| optim_op_outputs_.insert(output); |
| } |
| } |
| } |
| |
    // Compute each operator's in-degree (op_inputs_) and initialize how many
    // ops share each input blob (share_counts_).
    // Note: we have to handle the cases where output blobs are shared.
| std::unordered_map<string, int> blob_seen; |
| for (const int op_index : op_indices) { |
| for (const auto& input : net.op(op_index).input()) { |
| if (has_key(shareable_blob_names, input) || |
| has_key(heads_blobs_set, input)) { |
| if (has_key(optim_op_outputs_, input)) { |
| CAFFE_ENFORCE( |
| blob_seen.find(input) != blob_seen.end(), |
| "Input ", |
| input, |
| " was not output by an op before"); |
| op_inputs_[op_index] += blob_seen[input]; |
| } else { |
| share_counts_[input] = 1; |
| } |
| blob_to_ops_[input].push_back(op_index); |
| } |
| } |
| for (const auto& output : net.op(op_index).output()) { |
| blob_seen[output] += 1; |
| blob_device_[output] = net.op(op_index).device_option(); |
        // Exception: CopyGPUToCPU carries a CUDA device option, but its
        // outputs live on CPU, so record them with a CPU device.
| if (net.op(op_index).type() == "CopyGPUToCPU") { |
| blob_device_[output].set_device_type(0); |
| blob_device_[output].set_device_id(0); |
| } |
| } |
| } |
| |
    // The main recursive traversal. We start a DFS in the operator graph from
    // the input blobs. Note that the ordering of the inputs does not reflect
    // the operator graph ordering, so to avoid visiting children before their
    // parents, traversal begins at root ops (in-degree zero) and recurses
    // into their children.
| for (const auto& input_blob : heads) { |
| for (const int op_index : blob_to_ops_[input_blob]) { |
| if (!op_visited_[op_index] && !op_inputs_[op_index]) { |
| vector<std::pair<int, string>> free_blobs; |
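          // Each DFS root starts with a fresh token; tokens record which
          // execution branches have touched a blob, and a blob may only be
          // recycled by an op holding all of them (see can_use_blob).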
| std::unordered_set<int> tokens{tokens_counter_++}; |
| process_op( |
| net, |
| shareable_blob_names, |
| namescope, |
| dont_share_blob_names, |
| blob_shapes, |
| op_index, |
| &free_blobs, |
| &tokens); |
| } |
| } |
| } |
| |
| // Rename mapped blobs. |
| std::unordered_map<string, string> renamed; |
| int name_idx = 0; |
| std::unordered_set<string> mapped_blobs_set; |
| for (const auto& mapped_blob : mapping_) { |
| mapped_blobs_set.insert(mapped_blob.second); |
| if (has_key(optim_op_outputs_, mapped_blob.second)) { |
| if (renamed.find(mapped_blob.second) == renamed.end()) { |
| renamed.insert( |
| {mapped_blob.second, |
| namescope + "__m" + c10::to_string(name_idx++) + "_shared"}); |
| } |
| } else { |
| renamed.insert({mapped_blob.second, mapped_blob.second}); |
| } |
| } |
| |
    // Propagate renamings through mapping_ until a fixed point is reached.
| mapping_.insert(renamed.begin(), renamed.end()); |
| bool had_changes = true; |
| while (had_changes) { |
| had_changes = false; |
| for (const auto& mapped_blob : mapping_) { |
        if (has_key(renamed, mapped_blob.second) &&
            renamed[mapped_blob.second] != mapped_blob.second) {
          renamed[mapped_blob.first] = renamed[mapped_blob.second];
          mapping_[mapped_blob.first] = renamed[mapped_blob.first];
          had_changes = true;
        }
| } |
| } |
| |
| NetDef optimized_net = apply_assignments(net); |
    LOG(INFO) << "Remapping " << mapping_.size() << " blobs, using "
              << mapped_blobs_set.size() << " shared blobs.";
| if (floats_saved_ > 0) { |
      LOG(INFO) << "Memonger saved approximately "
| << (floats_saved_ * 4.0 / 1024.0 / 1024.0) << " MB."; |
| } |
| |
| return optimized_net; |
| } |
| |
| private: |
| NetDef apply_assignments(const NetDef& net) { |
| NetDef optimized_net = net; |
| // Rename optimized_net blobs. |
| for (int i = 0; i < optimized_net.op_size(); ++i) { |
      // Special handling for RecurrentNetwork ops (and their gradient
      // variants), whose internal step nets can refer to memongered blobs.
| if (optimized_net.op(i).type().find("RecurrentNetwork") == 0) { |
| apply_recurrent_blob_assignments(optimized_net.mutable_op(i)); |
| } |
| |
      // Special handling for AsyncIf ops, whose internal then/else nets can
      // refer to memongered blobs.
| if (optimized_net.op(i).type() == "AsyncIf") { |
| apply_asyncif_blob_assignments(optimized_net.mutable_op(i)); |
| } |
| |
| for (int j = 0; j < optimized_net.op(i).input_size(); ++j) { |
| const string& input_name = |
| get_blob_or_mapped_blob(optimized_net.op(i).input(j)); |
| optimized_net.mutable_op(i)->set_input(j, input_name); |
| } |
| |
| for (int j = 0; j < optimized_net.op(i).output_size(); ++j) { |
| auto output_name = |
| get_blob_or_mapped_blob(optimized_net.op(i).output(j)); |
| optimized_net.mutable_op(i)->set_output(j, output_name); |
| } |
| } |
| return optimized_net; |
| } |
| |
| void apply_recurrent_blob_assignments(OperatorDef* op) { |
    // Recursively apply assignments to the step nets of RecurrentNetwork ops,
    // and attach a mapping table of the renamed blobs.
| for (int i = 0; i < op->arg_size(); i++) { |
| Argument* arg = op->mutable_arg(i); |
| const string& name = arg->name(); |
| if (name == "step_net" || name == "backward_step_net") { |
| if (arg->has_n()) { |
| NetDef* step_net_ref = arg->mutable_n(); |
| CAFFE_ENFORCE( |
| !arg->has_s(), |
| "Invalid definition for ", |
| name, |
| ". Only one of NetDef and string should be present"); |
| NetDef optimized_net = apply_assignments(*step_net_ref); |
| step_net_ref->CopyFrom(optimized_net); |
| } else { |
| NetDef step_net; |
          CAFFE_ENFORCE(
              TextFormat::ParseFromString(arg->s(), &step_net),
              "Could not parse step net: ",
              name);
| step_net = apply_assignments(step_net); |
| arg->set_s(ProtoDebugString(step_net)); |
| } |
| } |
| } |
| |
    // Record each renamed input/output as a "<blob>.rename" argument.
| vector<string> inputs_outputs(op->input().begin(), op->input().end()); |
| inputs_outputs.insert( |
| inputs_outputs.end(), op->output().begin(), op->output().end()); |
| |
| for (auto& b : inputs_outputs) { |
| string mapped = get_blob_or_mapped_blob(b); |
| if (b != mapped) { |
| Argument* map_arg = op->add_arg(); |
| map_arg->set_name(b + ".rename"); |
| map_arg->set_s(mapped); |
| } |
| } |
| } |
| |
| void apply_asyncif_blob_assignments(OperatorDef* op) { |
| for (int i = 0; i < op->arg_size(); i++) { |
| Argument* arg = op->mutable_arg(i); |
| const string& name = arg->name(); |
| if (name == "then_net" || name == "else_net") { |
| NetDef* step_net_ref = arg->mutable_n(); |
| NetDef optimized_net = apply_assignments(*step_net_ref); |
| |
        // Update the external input/output mappings of this internal net as
        // well.
| std::vector<string> optim_external_inputs; |
| for (auto& blob_name : optimized_net.external_input()) { |
| optim_external_inputs.push_back(get_blob_or_mapped_blob(blob_name)); |
| } |
| optimized_net.mutable_external_input()->Clear(); |
| for (const auto& blob_name : optim_external_inputs) { |
| optimized_net.add_external_input(blob_name); |
| } |
| |
| std::vector<string> optim_external_outputs; |
| for (auto& blob_name : optimized_net.external_output()) { |
| optim_external_outputs.push_back(get_blob_or_mapped_blob(blob_name)); |
| } |
| optimized_net.mutable_external_output()->Clear(); |
| for (const auto& blob_name : optim_external_outputs) { |
| optimized_net.add_external_output(blob_name); |
| } |
| |
| step_net_ref->CopyFrom(optimized_net); |
| } |
| } |
| } |
| |
| template <typename K, typename V> |
| inline bool has_key(const std::unordered_map<K, V>& in_map, const K& key) { |
| return in_map.find(key) != in_map.end(); |
| } |
| |
| template <typename K> |
| inline bool has_key(const std::unordered_set<K>& in_set, const K& key) { |
| return in_set.find(key) != in_set.end(); |
| } |
| |
| void process_op( |
| const NetDef& net, |
| const std::unordered_set<string>& shareable_blob_names, |
| const string& namescope, |
| const std::unordered_set<string>& dont_share_blob_names, |
| const std::unordered_map<string, vector<int>>& blob_shapes, |
| int op_index, |
| std::vector<std::pair<int, string>>* free_blobs, |
| std::unordered_set<int>* tokens) { |
    // The tokens we now hold are the union of the tokens this operator is
    // already holding and the tokens pushed down from its parents.
| tokens->insert( |
| op_token_deposit_[op_index].begin(), op_token_deposit_[op_index].end()); |
| op_token_deposit_[op_index].clear(); |
| CAFFE_ENFORCE(!op_visited_[op_index]); |
| op_visited_[op_index] = true; |
| |
| const OperatorDef& current_op = net.op(op_index); |
| |
    // The set of input blobs freed by processing the current op.
| std::vector<std::pair<int, string>> new_free_blobs; |
| std::unordered_set<string> new_free_blobs_set; |
| |
| // Now update blob tokens. |
| for (const auto& input : current_op.input()) { |
| const auto& actual_blob = get_blob_or_mapped_blob(input); |
| req_tokens_[actual_blob].insert(tokens->begin(), tokens->end()); |
| if (actual_blob != input) { |
| req_tokens_[input].insert(tokens->begin(), tokens->end()); |
| } |
| } |
| for (const auto& output : current_op.output()) { |
| const auto& actual_blob = get_blob_or_mapped_blob(output); |
| req_tokens_[actual_blob].insert(tokens->begin(), tokens->end()); |
| if (actual_blob != output) { |
| req_tokens_[output].insert(tokens->begin(), tokens->end()); |
| } |
| } |
| |
    // Increment each input's consumer count; once every consumer of an input
    // has run, the input blob can be freed.
| for (const auto& input : current_op.input()) { |
| if (has_key(shareable_blob_names, input)) { |
| blob_input_count_[input]++; |
| if (blob_input_count_[input] == (int)blob_to_ops_[input].size()) { |
| const string& actual_blob = get_blob_or_mapped_blob(input); |
| if (!has_key(dont_share_blob_names, actual_blob)) { |
| new_free_blobs.emplace_back( |
| -share_counts_[actual_blob], actual_blob); |
| new_free_blobs_set.insert(actual_blob); |
| } |
| } |
| } |
| } |
| |
    // Check whether we can recycle a free blob and use it as an output blob.
| for (const auto& output : current_op.output()) { |
| if (has_key(shareable_blob_names, output) && |
| !has_key(processed_output_blobs_, output) && |
| !has_key(new_free_blobs_set, output)) { |
| const string freed_blob = get_free_blob( |
| output, blob_shapes, tokens, free_blobs, blob_device_[output]); |
| if (freed_blob != "") { |
| req_tokens_[freed_blob].insert(tokens->begin(), tokens->end()); |
| share_counts_[freed_blob]++; |
| mapping_[output] = freed_blob; |
| } |
| processed_output_blobs_.insert(output); |
| } |
| } |
| |
    // Add the newly freed blobs to the free pool, restoring the heap order so
    // the most-shared blob is recycled first.
| std::unordered_set<string> free_blob_set; |
| for (const auto& free_blob : *free_blobs) { |
| free_blob_set.insert(free_blob.second); |
| } |
| for (const auto& new_free_blob : new_free_blobs) { |
| if (!has_key(free_blob_set, new_free_blob.second)) { |
| free_blobs->push_back(new_free_blob); |
| if (blob_shapes.size() > 0) { |
| if (!has_key(blob_sizes_, new_free_blob.second)) { |
| blob_sizes_.insert( |
| {new_free_blob.second, |
| infer_blob_size(new_free_blob.second, blob_shapes)}); |
| } |
| } |
| std::push_heap( |
| free_blobs->begin(), |
| free_blobs->end(), |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::greater<std::pair<int, string>>()); |
| } |
| } |
| |
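    // If this op's outputs feed more than one downstream op, execution
    // branches: each branch gets its own token so blobs freed on one branch
    // are not reused on a parallel branch.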
| int num_branches = 0; |
| for (const auto& output : current_op.output()) { |
| num_branches += blob_to_ops_[output].size(); |
| } |
| |
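    // Recurse into children whose inputs have now all been produced; a child
    // that is not yet ready has the current tokens deposited for the visit
    // that eventually reaches it.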
| for (const auto& output : current_op.output()) { |
| for (const auto& input_op_index : blob_to_ops_[output]) { |
| op_visited_count_[input_op_index]++; |
| if (op_visited_count_[input_op_index] == op_inputs_[input_op_index]) { |
| std::unordered_set<int> new_tokens; |
| new_tokens.insert(tokens->begin(), tokens->end()); |
| if (num_branches > 1) { |
| new_tokens.insert(tokens_counter_++); |
| } |
| process_op( |
| net, |
| shareable_blob_names, |
| namescope, |
| dont_share_blob_names, |
| blob_shapes, |
| input_op_index, |
| free_blobs, |
| &new_tokens); |
| } else { |
| if (!op_visited_[input_op_index]) { |
| op_token_deposit_[input_op_index].insert( |
| tokens->begin(), tokens->end()); |
| } |
| } |
| } |
| } |
| } |
| |
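  // Product of the blob's dimensions; returns 0 when the shape is unknown.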
| inline int infer_blob_size( |
| const string& blob_name, |
| const std::unordered_map<string, vector<int>>& blob_shapes) { |
| const auto& blob_shapes_iter = blob_shapes.find(blob_name); |
| if (blob_shapes_iter == blob_shapes.end()) { |
| return 0; |
| } |
| int size = 1; |
| // NOLINTNEXTLINE(modernize-loop-convert) |
| for (size_t i = 0; i < blob_shapes_iter->second.size(); ++i) { |
| size *= blob_shapes_iter->second[i]; |
| } |
| return size; |
| } |
| |
| inline string get_blob_or_mapped_blob(const string& blob_name) { |
| auto mapped_blob = mapping_.find(blob_name); |
| if (mapped_blob == mapping_.end()) { |
| return blob_name; |
| } else { |
| return mapped_blob->second; |
| } |
| } |
| |
  // Returns true if the blob lives on the requested device and the current
  // traversal holds every token the blob requires.
| inline bool can_use_blob( |
| const string& blob_name, |
| std::unordered_set<int>* tokens, |
| const DeviceOption& device_option) { |
| const DeviceOption& blob_device = blob_device_[blob_name]; |
| if (device_option.device_type() != blob_device.device_type() || |
| device_option.device_id() != blob_device.device_id()) { |
| return false; |
| } |
| for (const int token : req_tokens_[blob_name]) { |
| if (tokens->find(token) == tokens->end()) { |
| return false; |
| } |
| } |
| return true; |
| }; |
| |
| // Returns the name of the blob that we are going to map blob_name into. |
| inline string get_free_blob( |
| const string& blob_name, |
| const std::unordered_map<string, vector<int>>& blob_shapes, |
| std::unordered_set<int>* tokens, |
| std::vector<std::pair<int, string>>* free_blobs, |
| const DeviceOption& device) { |
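    // Two strategies: without shape information, pop candidates off a heap
    // keyed by -share_count (most-shared blob first); with shape information,
    // scan for an approximate best fit by inferred size.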
| string freed_blob = ""; |
| if (blob_shapes.size() == 0) { |
| std::vector<std::pair<int, string>> cant_use_blobs; |
| while (free_blobs->size() > 0) { |
| std::pop_heap( |
| free_blobs->begin(), |
| free_blobs->end(), |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::greater<std::pair<int, string>>()); |
| const auto cand_free_blob = free_blobs->back(); |
| free_blobs->pop_back(); |
| if (can_use_blob(cand_free_blob.second, tokens, device)) { |
| freed_blob = cand_free_blob.second; |
| break; |
| } else { |
| cant_use_blobs.push_back(cand_free_blob); |
| } |
| } |
| for (const auto& cant_use_blob : cant_use_blobs) { |
| free_blobs->push_back(cant_use_blob); |
| std::push_heap( |
| free_blobs->begin(), |
| free_blobs->end(), |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::greater<std::pair<int, string>>()); |
| } |
| } else { |
      // Heuristic: grow the chosen candidate while it is still smaller than
      // blob_size, and stop growing once it is large enough to fit the
      // output (an approximate best fit).
| const int blob_size = infer_blob_size(blob_name, blob_shapes); |
| int best_size = -1; |
| int free_blob_index = -1; |
| for (size_t i = 0; i < free_blobs->size(); ++i) { |
| const string& cb_name = (*free_blobs)[i].second; |
| if (can_use_blob(cb_name, tokens, device)) { |
| const int cand_bz = blob_sizes_[cb_name]; |
| CAFFE_ENFORCE(blob_sizes_.find(cb_name) != blob_sizes_.end()); |
| if (cand_bz >= best_size) { |
| if (best_size < blob_size || best_size >= cand_bz) { |
| best_size = cand_bz; |
| free_blob_index = i; |
| } |
| } |
| } |
| } |
| if (free_blob_index != -1) { |
| floats_saved_ += best_size; |
| freed_blob = (*free_blobs)[free_blob_index].second; |
| free_blobs->erase(free_blobs->begin() + free_blob_index); |
| } |
| } |
| return freed_blob; |
| }; |
| |
| int tokens_counter_ = 1; |
| int floats_saved_ = 0; |
| // blob_name -> Op edges. |
| std::unordered_map<string, std::vector<int>> blob_to_ops_; |
  // How many times each blob has been consumed so far.
  std::unordered_map<string, int> blob_input_count_;
  // In-degree of each op: the number of its inputs produced inside the
  // optimized subgraph.
  std::vector<int> op_inputs_;
| // Current Op visit counts. |
| std::vector<int> op_visited_count_; |
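  // Number of times each blob's storage has been handed out for reuse.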
| std::unordered_map<string, int> share_counts_; |
| std::unordered_map<string, int> blob_sizes_; |
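  // Tokens that must all be held before a blob's storage may be reused.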
| std::unordered_map<string, std::unordered_set<int>> req_tokens_; |
| std::vector<std::unordered_set<int>> op_token_deposit_; |
| std::unordered_set<string> optim_op_outputs_; |
| std::unordered_map<string, string> mapping_; |
| std::unordered_map<string, DeviceOption> blob_device_; |
| // The set of output blobs we already processed. |
| std::unordered_set<string> processed_output_blobs_; |
| std::vector<bool> op_visited_; |
| }; |
| |
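// Public entry point: runs the DAG memonger over the sub-DAG given by
// `op_indices`, starting the traversal from the `heads` blobs.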
| NetDef compute_blob_recycling_for_dag( |
| const NetDef& net, |
| const std::vector<string>& heads, |
| const std::vector<int>& op_indices, |
| const std::unordered_set<string>& shareable_blob_names, |
| const string& namescope, |
| const std::unordered_set<string>& dont_share_blob_names, |
| const std::unordered_map<string, vector<int>>& blob_shapes) { |
| ComputeBlobRecyclingForDag memonger(net.op_size()); |
| return memonger.OptimizeNet( |
| net, |
| heads, |
| op_indices, |
| shareable_blob_names, |
| namescope, |
| dont_share_blob_names, |
| blob_shapes); |
| } |
| |
} // namespace memonger
} // namespace caffe2