#include "caffe2/core/net_simple.h"
#include "caffe2/core/net.h"

// Standard-library headers. <algorithm>, <iomanip>, <numeric>, and <sstream>
// are included explicitly (include-what-you-use): this file calls std::sort,
// std::setw/std::setfill, std::accumulate, and std::stringstream.
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "caffe2/core/operator.h"
#include "caffe2/core/static_tracepoint.h"
#include "caffe2/core/timer.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/proto_utils.h"

C10_DEFINE_bool(
    caffe2_simple_net_benchmark_run_whole_net,
    true,
    "If false, whole net passes won't be performed");

namespace caffe2 {

// SimpleNet runs its operators sequentially, on the calling thread, in the
// order they appear in the NetDef.
//
// The constructor instantiates one OperatorBase per OperatorDef in `net_def`.
// If an operator def carries no device option but the net def does, the net's
// device option is copied onto a private copy of the operator def before
// creation, so the operator inherits the net-level device placement.
SimpleNet::SimpleNet(
    const std::shared_ptr<const NetDef>& net_def,
    Workspace* ws)
    : NetBase(net_def, ws) {
  VLOG(1) << "Constructing SimpleNet " << net_def->name();
  const bool net_def_has_device_option = net_def->has_device_option();
  // Initialize the operators
  for (int idx = 0; idx < net_def->op_size(); ++idx) {
    const auto& operator_def = net_def->op(idx);
    VLOG(1) << "Creating operator " << operator_def.name() << ": "
            << operator_def.type();
    std::unique_ptr<OperatorBase> op{nullptr};
    if (!operator_def.has_device_option() && net_def_has_device_option) {
      // In the case that the operator def does not specify a device option but
      // the net def has a default option, we copy the device option over to
      // the operator def.
      OperatorDef temp_def(operator_def);
      temp_def.mutable_device_option()->CopyFrom(net_def->device_option());
      op = CreateOperator(temp_def, ws, idx);
    } else {
      op = CreateOperator(operator_def, ws, idx);
      // Aliasing shared_ptr: shares ownership of the whole NetDef while
      // pointing at the idx-th embedded OperatorDef, so the debug def stays
      // valid as long as someone holds the net def.
      op->set_debug_def(
          std::shared_ptr<const OperatorDef>{net_def, &(net_def->op(idx))});
    }
    operators_.emplace_back(std::move(op));
  }
}

// Runs every operator once, in order. Returns false (and logs the failing
// operator's proto) as soon as any operator's Run() fails; otherwise returns
// true after notifying all observers.
bool SimpleNet::Run() {
  StartAllObservers();
  VLOG(1) << "Running net " << name_;
  for (auto& op : operators_) {
    VLOG(1) << "Running operator " << op->debug_def().name() << "("
            << op->debug_def().type() << ").";
#ifdef CAFFE2_ENABLE_SDT
    // Static tracepoints (e.g. for dtrace/systemtap) bracketing each op run.
    const auto& op_name = op->debug_def().name().c_str();
    const auto& op_type = op->debug_def().type().c_str();
    auto* op_ptr = op.get();
    const auto& net_name = name_.c_str();
    CAFFE_SDT(operator_start, net_name, op_name, op_type, op_ptr);
#endif
    bool res = op->Run();
#ifdef CAFFE2_ENABLE_SDT
    CAFFE_SDT(operator_done, net_name, op_name, op_type, op_ptr);
#endif
    if (!res) {
      LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
      return false;
    }
  }
  StopAllObservers();
  return true;
}

// SimpleNet has no asynchronous execution: "async" is just a synchronous run.
bool SimpleNet::RunAsync() {
  return Run();
}

namespace {
// Comparator for sorting (name, metric) pairs in descending metric order.
template <typename A, typename B>
bool PairLargerThan(const std::pair<A, B>& x, const std::pair<A, B>& y) {
  return x.second > y.second;
}
} // namespace

// Benchmarks the net.
//
// @param warmup_runs     non-negative number of untimed warm-up iterations.
// @param main_runs       non-negative number of timed iterations.
// @param run_individual  if true, additionally times each operator separately
//                        and prints per-operator / per-operator-type
//                        statistics (time, and — where the op schema provides
//                        a cost-inference function and all input shapes are
//                        known — FLOPs and memory traffic).
// @return per-operator average milliseconds; if the whole-net pass ran, the
//         whole-net ms/iter is prepended as element 0.
vector<float> SimpleNet::TEST_Benchmark(
    const int warmup_runs,
    const int main_runs,
    const bool run_individual) {
  /* Use std::cout because logging may be disabled */
  std::cout << "Starting benchmark." << std::endl;
  std::cout << "Running warmup runs." << std::endl;
  CAFFE_ENFORCE(
      warmup_runs >= 0,
      "Number of warm up runs should be non negative, provided ",
      warmup_runs,
      ".");
  for (int i = 0; i < warmup_runs; ++i) {
    CAFFE_ENFORCE(Run(), "Warmup run ", i, " has failed.");
  }

  std::cout << "Main runs." << std::endl;
  CAFFE_ENFORCE(
      main_runs >= 0,
      "Number of main runs should be non negative, provided ",
      main_runs,
      ".");
  Timer timer;
  auto millis = timer.MilliSeconds();
  if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
    for (int i = 0; i < main_runs; ++i) {
      CAFFE_ENFORCE(Run(), "Main run ", i, " has failed.");
    }
    millis = timer.MilliSeconds();
    // NOTE(review): main_runs == 0 is accepted and makes these divisions
    // produce inf/nan in the printout — preserved from the original contract.
    std::cout << "Main run finished. Milliseconds per iter: "
              << millis / main_runs
              << ". Iters per second: " << 1000.0 * main_runs / millis
              << std::endl;
  }

  // Per-operator (indexed) and per-operator-type (keyed) accumulators.
  vector<float> time_per_op(operators_.size(), 0);
  vector<uint64_t> flops_per_op;
  vector<uint64_t> memory_bytes_read_per_op;
  vector<uint64_t> memory_bytes_written_per_op;
  vector<uint64_t> param_bytes_per_op;
  CaffeMap<string, float> time_per_op_type;
  CaffeMap<string, float> flops_per_op_type;
  CaffeMap<string, float> memory_bytes_read_per_op_type;
  CaffeMap<string, float> memory_bytes_written_per_op_type;
  CaffeMap<string, float> param_bytes_per_op_type;
  if (run_individual) {
    for (int i = 0; i < main_runs; ++i) {
      for (auto& op : operators_) {
        op->ResetEvent();
      }
      int idx = 0;
      for (auto& op : operators_) {
        const string& op_type = op->debug_def().type();
        if (i == 0) { // Gather flops on the first run.
          auto* schema = OpSchemaRegistry::Schema(op_type);
          if (schema && schema->HasCostInferenceFunction()) {
            vector<TensorShape> shapes = op->InputTensorShapes();
            // Cost inference is only meaningful when every input shape is
            // fully known.
            auto all_good_shapes = std::accumulate(
                shapes.begin(),
                shapes.end(),
                true,
                [](bool acc, const TensorShape& shape) {
                  return acc && !shape.unknown_shape();
                });
            OpSchema::Cost cost;
            if (all_good_shapes) {
              cost = schema->InferCost(op->debug_def(), shapes);
            }
            flops_per_op.emplace_back(cost.flops);
            memory_bytes_read_per_op.emplace_back(cost.bytes_read);
            memory_bytes_written_per_op.emplace_back(cost.bytes_written);
            param_bytes_per_op.emplace_back(cost.params_bytes);

            flops_per_op_type[op_type] += cost.flops;
            memory_bytes_read_per_op_type[op_type] += cost.bytes_read;
            memory_bytes_written_per_op_type[op_type] += cost.bytes_written;
            param_bytes_per_op_type[op_type] += cost.params_bytes;
          } else {
            // No cost info available; keep the vectors index-aligned with
            // operators_ by pushing zeros.
            flops_per_op.emplace_back(0);
            memory_bytes_read_per_op.emplace_back(0);
            memory_bytes_written_per_op.emplace_back(0);
            param_bytes_per_op.emplace_back(0);
          }
        }
        timer.Start();
        CAFFE_ENFORCE(
            op->Run(),
            "operator ",
            op->debug_def().name(),
            "(",
            op_type,
            ") has failed.");
        float spent = timer.MilliSeconds();
        time_per_op[idx] += spent;
        time_per_op_type[op_type] += spent;
        ++idx;
      }
    }

    size_t idx = 0;
    for (auto& op : operators_) {
      const string& op_type = op->debug_def().type();
      const string& print_name =
          (op->debug_def().name().size()
               ? op->debug_def().name()
               : (op->debug_def().output_size() ? op->debug_def().output(0)
                                                : "NO_OUTPUT"));
      std::stringstream flops_str;
      if (idx < flops_per_op.size() && flops_per_op[idx]) {
        flops_str << " (" << to_string(1.0e-9 * flops_per_op[idx]) << " GFLOP, "
                  << to_string(
                         1.0e-6 * flops_per_op[idx] / time_per_op[idx] *
                         main_runs)
                  << " GFLOPS)";
      }
      std::stringstream memory_bytes_read_str;
      if (idx < memory_bytes_read_per_op.size() &&
          memory_bytes_read_per_op[idx]) {
        memory_bytes_read_str
            << " (" << to_string(1.0e-6 * memory_bytes_read_per_op[idx])
            << " MB)";
      }
      std::stringstream memory_bytes_written_str;
      if (idx < memory_bytes_written_per_op.size() &&
          memory_bytes_written_per_op[idx]) {
        memory_bytes_written_str
            << " (" << to_string(1.0e-6 * memory_bytes_written_per_op[idx])
            << " MB)";
      }
      std::stringstream param_bytes_str;
      if (idx < param_bytes_per_op.size() && param_bytes_per_op[idx]) {
        param_bytes_str << " ("
                        << to_string(1.0e-6 * param_bytes_per_op[idx])
                        << " MB)";
      }
      // BUGFIX: memory_bytes_read_str was built but never printed; stream it
      // alongside the other per-operator statistics.
      std::cout << "Operator #" << idx << " (" << print_name << ", " << op_type
                << ") " << time_per_op[idx] / main_runs << " ms/iter"
                << flops_str.str() << memory_bytes_read_str.str()
                << memory_bytes_written_str.str() << param_bytes_str.str()
                << std::endl;
      ++idx;
    }

    // Aggregate report: one section per metric, each sorted by descending
    // per-op-type contribution. metric / normalizer / unit are parallel
    // arrays indexed together with metric_per_op_type_vec_vec.
    const std::vector<string> metric(
        {"Time",
         "FLOP",
         "Feature Memory Read",
         "Feature Memory Written",
         "Parameter Memory"});
    const std::vector<double> normalizer(
        {1.0 / main_runs, 1.0e-9, 1.0e-6, 1.0e-6, 1.0e-6});
    const std::vector<string> unit({"ms", "GFLOP", "MB", "MB", "MB"});

    std::vector<CaffeMap<string, float>*> metric_per_op_type_vec_vec;
    metric_per_op_type_vec_vec.emplace_back(&time_per_op_type);
    metric_per_op_type_vec_vec.emplace_back(&flops_per_op_type);
    metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type);
    metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type);
    metric_per_op_type_vec_vec.emplace_back(&param_bytes_per_op_type);
    for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) {
      std::cout << metric[i] << " per operator type:" << std::endl;
      auto* item = metric_per_op_type_vec_vec[i];
      std::vector<std::pair<string, float>> metric_per_op_type_vec(
          (*item).begin(), (*item).end());
      std::sort(
          metric_per_op_type_vec.begin(),
          metric_per_op_type_vec.end(),
          PairLargerThan<string, float>);
      float total_metric = 0.;
      for (const auto& op_item : metric_per_op_type_vec) {
        total_metric += op_item.second * normalizer[i];
      }
      for (const auto& op_item : metric_per_op_type_vec) {
        float percent = 0.;
        if (total_metric > 0.) {
          percent = (100.0 * op_item.second * normalizer[i] / total_metric);
        }
        std::cout << std::setw(15) << std::setfill(' ')
                  << op_item.second * normalizer[i] << " " << unit[i] << ". "
                  << std::setw(10) << std::setfill(' ') << percent << "%. "
                  << op_item.first << std::endl;
      }
      std::cout << std::setw(15) << std::setfill(' ') << total_metric << " "
                << unit[i] << " in Total" << std::endl;
    }
  }

  // We will reuse time_per_op to return the result of BenchmarkNet.
  for (size_t i = 0; i < time_per_op.size(); ++i) {
    time_per_op[i] /= main_runs;
  }
  if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
    time_per_op.insert(time_per_op.begin(), millis / main_runs);
  }
  return time_per_op;
}

REGISTER_NET(simple, SimpleNet);

} // namespace caffe2