/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <opencv2/opencv.hpp>
#include <climits>
#include <cmath>
#include <fstream>

#include "caffe2/core/common.h"
#include "caffe2/core/db.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/timer.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/proto_utils.h"
#include "caffe2/utils/string_utils.h"
#include "caffe2/utils/bench_utils.h"

#include "binaries/benchmark_args.h"
#include "binaries/benchmark_helper.h"

#include <observers/net_observer_reporter_print.h>
#include <observers/observer_config.h>
#include <observers/perf_observer.h>

C10_DEFINE_int(
    batch_size,
    -1,
    "Specify the batch size of the input. The number of items in the "
    "input needs to be a multiple of the batch size. If the batch size "
    "is less than 0, all inputs are put in one batch.");
C10_DEFINE_bool(color, true, "If set, load images in color.");
C10_DEFINE_string(
    crop,
    "-1,-1",
    "The center-cropped height and width. If a value is less than zero, "
    "the image is not cropped.");
C10_DEFINE_string(input_image_files, "", "Files containing input images.");
C10_DEFINE_string(input_text_files, "", "Text files to be written to blobs.");
C10_DEFINE_string(
    preprocess,
    "",
    "Options to specify the preprocess routines. The available options are "
    "subtract128, normalize, mean, std, bgrtorgb. If multiple steps are "
    "provided, separate them with commas (,); they are applied in sequence.");
C10_DEFINE_string(
    report_time,
    "",
    "Report the conversion stage time to screen. "
    "The format of the string is <type>|<identifier>. "
    "The only valid type is 'json'. "
    "The identifier is optional; if given, it prefixes every output line.");
C10_DEFINE_string(
    scale,
    "-1,-1",
    "Scale the images to fit within the min,max box. The shorter edge is "
    "scaled to min pixels, but if the longer edge would then exceed max "
    "pixels, the longer edge is scaled to max pixels instead (and the "
    "shorter edge can end up smaller than min pixels).");
C10_DEFINE_bool(warp, false, "If warp is set, warp the images to square.");

namespace caffe2 {

void reportTime(
    std::string type,
    double ts,
    std::string metric,
    std::string unit) {
  if (FLAGS_report_time == "") {
    return;
  }
  vector<string> s = caffe2::split('|', FLAGS_report_time);
  assert(s[0] == "json");
  std::string identifier = "";
  if (s.size() > 1) {
    identifier = s[1];
  }
  std::cout << identifier << "{\"type\": \"" << type << "\", \"value\": " << ts
            << ", \"metric\": \"" << metric << "\", \"unit\": \"" << unit
            << "\"}" << std::endl;
}

void splitSizes(const std::string& arg, int* ptr0, int* ptr1) {
  vector<string> sizes = caffe2::split(',', arg);
  if (sizes.size() == 2) {
    *ptr0 = std::stoi(sizes[0]);
    *ptr1 = std::stoi(sizes[1]);
  } else if (sizes.size() == 1) {
    *ptr0 = std::stoi(sizes[0]);
    *ptr1 = std::stoi(sizes[0]);
  } else {
    assert(false);
  }
}

cv::Mat resizeImage(cv::Mat& img) {
  int min_size, max_size;
  splitSizes(FLAGS_scale, &min_size, &max_size);
  if ((min_size <= 0) && (max_size <= 0)) {
    return img;
  }
  if (max_size < 0) {
    max_size = INT_MAX;
  }
  assert(min_size <= max_size);

  int im_min_size = img.rows > img.cols ? img.cols : img.rows;
  int im_max_size = img.rows > img.cols ?
      img.rows : img.cols;

  double im_scale = 1.0 * min_size / im_min_size;
  if (im_scale * im_max_size > max_size) {
    im_scale = 1.0 * max_size / im_max_size;
  }
  int scaled_width = int(round(img.cols * im_scale));
  int scaled_height = int(round(img.rows * im_scale));
  assert((scaled_width <= max_size) && (scaled_height <= max_size));
  if ((scaled_width < min_size) || (scaled_height < min_size)) {
    assert((scaled_width == max_size) || (scaled_height == max_size));
  } else {
    assert((scaled_width == min_size) || (scaled_height == min_size));
  }
  cv::Mat resized_img;
  cv::resize(
      img, resized_img, cv::Size(), im_scale, im_scale, cv::INTER_LINEAR);
  return resized_img;
}

cv::Mat cropToRec(cv::Mat& img, int* height_ptr, int* width_ptr) {
  int height = *height_ptr;
  int width = *width_ptr;
  if ((height > 0) && (width > 0) &&
      ((img.rows != height) || (img.cols != width))) {
    cv::Mat cropped_img, cimg;
    cv::Rect roi;
    roi.x = int((img.cols - width) / 2);
    roi.y = int((img.rows - height) / 2);
    roi.x = roi.x < 0 ? 0 : roi.x;
    roi.y = roi.y < 0 ? 0 : roi.y;
    width = width > img.cols ? img.cols : width;
    height = height > img.rows ? img.rows : height;
    roi.width = width;
    roi.height = height;
    assert(
        0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= img.cols &&
        0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= img.rows);
    cropped_img = img(roi);
    // Make the image in continuous space in memory
    cimg = cropped_img.clone();
    *height_ptr = height;
    *width_ptr = width;
    return cimg;
  } else {
    return img;
  }
}

std::vector<float> convertToVector(cv::Mat& img) {
  std::vector<float> normalize(3, 1);
  std::vector<float> mean(3, 0);
  std::vector<float> std(3, 1);
  bool bgrtorgb = false;
  int size = img.cols * img.rows;
  vector<string> steps = caffe2::split(',', FLAGS_preprocess);
  for (int i = 0; i < steps.size(); i++) {
    auto step = steps[i];
    if (step == "subtract128") {
      mean = {128, 128, 128};
      std = {1, 1, 1};
      normalize = {1, 1, 1};
    } else if (step == "normalize") {
      normalize = {255, 255, 255};
    } else if (step == "mean") {
      mean = {0.406f, 0.456f, 0.485f};
    } else if (step == "std") {
      std = {0.225f, 0.224f, 0.229f};
    } else if (step == "bgrtorgb") {
      bgrtorgb = true;
    } else {
      CAFFE_ENFORCE(
          false,
          "Unsupported preprocess step. The supported steps are: subtract128, "
          "normalize, mean, std, bgrtorgb.");
    }
  }

  int C = FLAGS_color ? 3 : 1;
  int total_size = C * size;
  std::vector<float> values(total_size);
  if (C == 1) {
    cv::MatIterator_<float> it, end;
    int idx = 0;
    for (it = img.begin<float>(), end = img.end<float>(); it != end; ++it) {
      values[idx++] = (*it / normalize[0] - mean[0]) / std[0];
    }
  } else {
    int i = 0;
    cv::MatIterator_<cv::Vec3f> it, end;
    int b = bgrtorgb ? 2 : 0;
    int g = 1;
    int r = bgrtorgb ? 0 : 2;
    for (it = img.begin<cv::Vec3f>(), end = img.end<cv::Vec3f>(); it != end;
         ++it, i++) {
      values[i] = (((*it)[b] / normalize[0] - mean[0]) / std[0]);
      int offset = size + i;
      values[offset] = (((*it)[g] / normalize[1] - mean[1]) / std[1]);
      offset = size + offset;
      values[offset] = (((*it)[r] / normalize[2] - mean[2]) / std[2]);
    }
  }
  return values;
}

std::vector<float> convertOneImage(
    std::string& filename,
    int* height_ptr,
    int* width_ptr) {
  assert(filename[0] != '~');

  std::cout << "Converting " << filename << std::endl;

  // Load image
  cv::Mat img_uint8 = cv::imread(
#if CV_MAJOR_VERSION <= 3
      filename, FLAGS_color ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE);
#else
      filename,
      FLAGS_color ?
          cv::IMREAD_COLOR : cv::IMREAD_GRAYSCALE);
#endif
  caffe2::Timer timer;
  timer.Start();
  cv::Mat img;
  // Convert image to floating point values
  img_uint8.convertTo(img, CV_32F);

  // Resize image
  cv::Mat resized_img = resizeImage(img);

  int height, width;
  splitSizes(FLAGS_crop, &height, &width);
  if ((height <= 0) || (width <= 0)) {
    height = resized_img.rows;
    width = resized_img.cols;
  }
  cv::Mat crop = cropToRec(resized_img, &height, &width);

  // Assert we don't have to deal with alignment
  DCHECK(crop.isContinuous());
  assert(crop.rows == height);
  assert(crop.cols == width);
  std::vector<float> one_image_values = convertToVector(crop);
  *height_ptr = height;
  *width_ptr = width;
  double ts = timer.MicroSeconds();
  reportTime("image_preprocess", ts, "convert", "us");
  return one_image_values;
}

int getBatchSize(int num_items) {
  int batch_size = FLAGS_batch_size;
  if (batch_size < 0) {
    batch_size = num_items;
  } else {
    assert(num_items % batch_size == 0);
  }
  return batch_size;
}

TensorProtos writeValues(
    std::vector<std::vector<std::vector<float>>>& values,
    std::vector<std::vector<int>>& dims) {
  caffe2::Timer timer;
  timer.Start();

  assert(dims.size() == values.size());
  int num_batches = dims.size();
  TensorProtos protos;
  for (int k = 0; k < num_batches; k++) {
    TensorProto* data;
    data = protos.add_protos();
    data->set_data_type(TensorProto::FLOAT);
    auto one_dim = dims[k];
    for (int dim : one_dim) {
      data->add_dims(dim);
    }
    int batch_size = one_dim[0];
    long long int entry_size = 1;
    for (int i = 1; i < one_dim.size(); i++) {
      entry_size *= one_dim[i];
    }

    // Not optimized
    for (int i = 0; i < batch_size; i++) {
      assert(values[k][i].size() == entry_size);
      for (int j = 0; j < values[k][i].size(); j++) {
        data->add_float_data(values[k][i][j]);
      }
    }
  }
  double ts = timer.MicroSeconds();
  reportTime("preprocess", ts, "data_pack", "us");

  return protos;
}

TensorProtos convertImages(std::string& image_file) {
  vector<string> file_names;
  if (image_file != "") {
    std::ifstream infile(image_file);
    std::string line;
    while (std::getline(infile, line)) {
      vector<string> file_name = caffe2::split(',', line);
      string name;
      if (file_name.size() == 3) {
        name = file_name[2];
      } else {
        name = line;
      }
      file_names.push_back(name);
    }
  } else {
    TensorProtos proto;
    return proto;
  }
  int batch_size = getBatchSize(file_names.size());
  int num_batches = file_names.size() / batch_size;
  assert(file_names.size() == batch_size * num_batches);
  std::vector<std::vector<std::vector<float>>> values;
  std::vector<std::vector<int>> dims;
  int C = FLAGS_color ? 3 : 1;
  for (int k = 0; k < num_batches; k++) {
    std::vector<std::vector<float>> one_value;
    int height = -1;
    int width = -1;
    for (int i = 0; i < batch_size; i++) {
      int idx = k * batch_size + i;
      int one_height, one_width;
      std::vector<float> one_image_values =
          convertOneImage(file_names[idx], &one_height, &one_width);
      if (height < 0 && width < 0) {
        height = one_height;
        width = one_width;
      } else {
        assert(height == one_height);
        assert(width == one_width);
      }
      one_value.push_back(one_image_values);
    }
    vector<int> one_dim = {batch_size, C, height, width};
    dims.push_back(one_dim);
    values.push_back(one_value);
  }
  return writeValues(values, dims);
}

template <class TYPE>
vector<TYPE> splitString(std::string& line) {
  vector<string> vector_str = caffe2::split(',', line);
  vector<TYPE> vector_int;
  for (string str : vector_str) {
    vector_int.push_back((TYPE)std::stod(str));
  }
  return vector_int;
}

/* Convert the values in a text file to blobs
   The format of the file should be:
   <number of items>, <dim2>, <dim3>, ... (dimensions of the items)
   <entry>, <entry>, <entry>, ...         (all entries in one item)
   <entry>, <entry>, <entry>, ...
   ....
*/
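// Illustrative sketch only (not part of the original source): assuming four
// 1x3 items and --batch_size=2, the value file consumed by convertValues()
// below might look like the following. The first line gives the item count
// and the remaining per-item dimensions; each subsequent line holds all
// entries of one item.
//
//   4, 1, 3
//   0.1, 0.2, 0.3
//   0.4, 0.5, 0.6
//   0.7, 0.8, 0.9
//   1.0, 1.1, 1.2
//
// With --batch_size=2 this produces two TensorProtos, each with dims
// {2, 1, 3} and six float entries.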
TensorProtos convertValues(std::string& file_name) {
  if (file_name == "") {
    TensorProtos proto;
    return proto;
  }
  std::ifstream infile(file_name);
  std::string line;
  std::getline(infile, line);
  vector<int> file_dims = splitString<int>(line);
  assert(file_dims.size() >= 2);

  int num_items = file_dims[0];
  int batch_size = getBatchSize(num_items);
  int num_batches = num_items / batch_size;
  assert(num_items == batch_size * num_batches);
  vector<string> lines;
  while (std::getline(infile, line)) {
    lines.push_back(line);
  }
  assert(lines.size() == num_items);
  std::vector<std::vector<std::vector<float>>> values;
  std::vector<std::vector<int>> dims;
  for (int i = 0; i < num_batches; i++) {
    std::vector<std::vector<float>> one_value;
    int num = -1;
    for (int j = 0; j < batch_size; j++) {
      int idx = i * batch_size + j;
      std::string line = lines[idx];
      vector<float> item = splitString<float>(line);
      if (num < 0) {
        num = item.size();
      } else {
        assert(num == item.size());
      }
      one_value.push_back(item);
    }
    vector<int> batch_dims = file_dims;
    batch_dims[0] = batch_size;
    dims.push_back(batch_dims);
    values.push_back(one_value);
  }
  return writeValues(values, dims);
}

} // namespace caffe2

void observerConfig() {
  caffe2::ClearGlobalNetObservers();
  caffe2::AddGlobalNetObserverCreator([](caffe2::NetBase* subject) {
    return std::make_unique<PerfNetObserver>(subject);
  });
  caffe2::ObserverConfig::setReporter(
      std::make_unique<NetObserverReporterPrint>());
}

bool backendCudaSet(const string& backend) {
  bool run_on_gpu = false;
  if (backend == "cuda") {
#ifdef __CUDA_ARCH__
    if (caffe2::HasCudaGPU()) {
      run_on_gpu = true;
    } else {
      CAFFE_THROW("NO GPU support on this host machine");
    }
#else
    CAFFE_THROW("NO GPU support");
#endif
  }
  return run_on_gpu;
}

void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) {
  if (backend != "builtin") {
    string engine;
    if (backend == "nnpack") {
      engine = "NNPACK";
    } else if (backend == "eigen") {
      engine = "EIGEN";
    } else if (backend == "mkl") {
      engine = "MKLDNN";
    } else if (backend == "cuda") {
      engine = "CUDA";
    } else if (backend == "dnnlowp") {
      engine = "DNNLOWP";
    } else if (backend == "dnnlowp_acc16") {
      engine = "DNNLOWP_ACC16";
    } else if (backend == "default") {
      engine = "";
    } else {
      engine = "NONE";
    }
    CAFFE_ENFORCE(engine != "NONE", "Backend is not supported");
    for (int i = 0; i < net_def->op_size(); i++) {
      caffe2::OperatorDef* op_def = net_def->mutable_op(i);
      op_def->set_engine(engine);
    }
  }
}

void fillInputBlob(
    shared_ptr<caffe2::Workspace> workspace,
    map<string, caffe2::TensorProtos>& tensor_protos_map,
    int iteration) {
  if (tensor_protos_map.empty()) {
    return;
  }
  static caffe2::TensorDeserializer deserializer;
  for (auto& tensor_kv : tensor_protos_map) {
    caffe2::Blob* blob = workspace->GetBlob(tensor_kv.first);
    if (blob == nullptr) {
      blob = workspace->CreateBlob(tensor_kv.first);
    }
    // todo: support gpu and make this function a template
    int protos_size = tensor_kv.second.protos_size();
    if (protos_size == 1 && iteration > 0) {
      // Do not override the input data if there is only one input data,
      // since it will clear all caches. Rely on wipe_cache to
      // clear caches
      continue;
    }
    caffe2::TensorProto* tensor_proto =
        tensor_kv.second.mutable_protos(iteration % protos_size);
    BlobSetTensor(blob, deserializer.Deserialize(*tensor_proto));
    // todo: for other types
  }
}

void writeOutput(
    shared_ptr<caffe2::Workspace> workspace,
    const bool run_on_gpu,
    const string& output,
    const string& output_folder,
    const bool text_output,
    const int index,
    const int num_blobs) {
  if (output.size() == 0) {
    return;
  }
  string output_prefix = output_folder.size() ?
output_folder + "/" : ""; vector output_names = caffe2::split(',', output); if (output == "*") { output_names = workspace->Blobs(); } for (const string& name : output_names) { CAFFE_ENFORCE( workspace->HasBlob(name), "You requested a non-existing blob: ", name); if (text_output) { if (run_on_gpu) { #ifdef __CUDA_ARCH__ writeTextOutput( workspace->GetBlob(name)->GetMutable(), output_prefix, name, index, num_blobs); #else CAFFE_THROW("Not support GPU."); #endif } else { writeTextOutput( BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU), output_prefix, name, index, num_blobs); } } else { // Do not support multiple entries per blob. CAFFE_ENFORCE( index == 0, "Binary file only support one output."); string serialized = SerializeBlob(*workspace->GetBlob(name), name); string output_filename = output_prefix + name; caffe2::WriteStringToFile(serialized, output_filename.c_str()); } } } void runNetwork( shared_ptr workspace, caffe2::NetDef& net_def, map& tensor_protos_map, const bool wipe_cache, const bool run_individual, const bool run_on_gpu, const bool text_output, const int warmup, const int iter, const int num_blobs, const int sleep_before_run, const int sleep_between_iteration, const int sleep_between_net_and_operator, const std::string& output, const std::string& output_folder) { if (!net_def.has_name()) { net_def.set_name("benchmark"); } caffe2::NetBase* net = workspace->CreateNet(net_def); TORCH_CHECK_NOTNULL(net); LOG(INFO) << "Starting benchmark."; caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup); LOG(INFO) << "Running warmup runs."; for (int i = 0; i < warmup; ++i) { fillInputBlob(workspace, tensor_protos_map, i); CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed."); } if (wipe_cache) { caffe2::wipe_cache(); } if (sleep_before_run > 0) { std::this_thread::sleep_for(std::chrono::seconds(sleep_before_run)); } LOG(INFO) << "Main runs."; CAFFE_ENFORCE( iter >= 0, "Number of main runs should be non negative, provided ", iter, "."); LOG(INFO) << "net runs."; for (int i = 0; i < iter; ++i) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup); fillInputBlob(workspace, tensor_protos_map, i); if (wipe_cache) { caffe2::wipe_cache(); } CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed."); // Write the output for the first num_blobs times writeOutput( workspace, run_on_gpu, output, output_folder, text_output, i, num_blobs); if (wipe_cache) { caffe2::wipe_cache(); } if (sleep_between_iteration > 0) { std::this_thread::sleep_for( std::chrono::seconds(sleep_between_iteration)); } } if (run_individual) { LOG(INFO) << "operator runs."; if (sleep_between_net_and_operator > 0) { std::this_thread::sleep_for( std::chrono::seconds(sleep_between_net_and_operator)); } for (int i = 0; i < iter; ++i) { caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup); fillInputBlob(workspace, tensor_protos_map, i); CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed."); if (wipe_cache) { caffe2::wipe_cache(); } if (sleep_between_iteration > 0) { std::this_thread::sleep_for( std::chrono::seconds(sleep_between_iteration)); } } } } int benchmark( int argc, char* argv[], const string& FLAGS_backend, const string& FLAGS_init_net, const string& FLAGS_input_dims, int FLAGS_iter, const string& FLAGS_net, const string& FLAGS_output, const string& FLAGS_output_folder, bool FLAGS_run_individual, int FLAGS_sleep_before_run, int FLAGS_sleep_between_iteration, int FLAGS_sleep_between_net_and_operator, bool FLAGS_text_output, int FLAGS_warmup, bool FLAGS_wipe_cache) 
{
  // Check arguments to be correct
  {
    // Need to check whether the files exist, as the file reader does not
    // assert if a file does not exist
    std::ifstream net_file(FLAGS_net);
    CAFFE_ENFORCE(net_file.good());
    net_file.close();

    std::ifstream init_net_file(FLAGS_init_net);
    CAFFE_ENFORCE(init_net_file.good());
    init_net_file.close();
  }

  observerConfig();
  caffe2::ShowLogInfoToStderr();

  auto workspace =
      std::make_shared<caffe2::Workspace>(new caffe2::Workspace());
  bool run_on_gpu = backendCudaSet(FLAGS_backend);

  // Run initialization network.
  caffe2::NetDef init_net_def;
  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def));
  setOperatorEngine(&init_net_def, FLAGS_backend);
  CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));

  // Run main network.
  caffe2::NetDef net_def;
  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
  setOperatorEngine(&net_def, FLAGS_backend);

  map<string, caffe2::TensorProtos> tensor_protos_map;

  // Default to 0 so num_blobs is defined even when no input files are given.
  int num_blobs = 0;
  vector<string> images = caffe2::split(';', FLAGS_input_image_files);
  for (int i = 0; i < images.size(); ++i) {
    vector<string> mapping = caffe2::split(',', images[i]);
    caffe2::TensorProtos proto_images = caffe2::convertImages(mapping[1]);
    workspace->CreateBlob(mapping[0]);
    tensor_protos_map.insert(std::make_pair(mapping[0], proto_images));
    num_blobs = proto_images.protos_size();
  }

  vector<string> values = caffe2::split(';', FLAGS_input_text_files);
  for (int i = 0; i < values.size(); ++i) {
    vector<string> mapping = caffe2::split(',', values[i]);
    caffe2::TensorProtos proto_values = caffe2::convertValues(mapping[1]);
    workspace->CreateBlob(mapping[0]);
    tensor_protos_map.insert(std::make_pair(mapping[0], proto_values));
    num_blobs = proto_values.protos_size();
  }

  runNetwork(
      workspace,
      net_def,
      tensor_protos_map,
      FLAGS_wipe_cache,
      FLAGS_run_individual,
      run_on_gpu,
      FLAGS_text_output,
      FLAGS_warmup,
      FLAGS_iter,
      num_blobs,
      FLAGS_sleep_before_run,
      FLAGS_sleep_between_iteration,
      FLAGS_sleep_between_net_and_operator,
      FLAGS_output,
      FLAGS_output_folder);
  return 0;
}

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  benchmark(
      argc,
      argv,
      FLAGS_backend,
      FLAGS_init_net,
      FLAGS_input_dims,
      FLAGS_iter,
      FLAGS_net,
      FLAGS_output,
      FLAGS_output_folder,
      FLAGS_run_individual,
      FLAGS_sleep_before_run,
      FLAGS_sleep_between_iteration,
      FLAGS_sleep_between_net_and_operator,
      FLAGS_text_output,
      FLAGS_warmup,
      FLAGS_wipe_cache);
  return 0;
}
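// Illustrative usage sketch (not part of the original source); the binary
// name and all file paths below are placeholders. Each --input_image_files
// entry is a "blob_name,list_file" pair (pairs separated by ';'), where
// list_file contains one image path per line; --input_text_files uses the
// same "blob_name,value_file" mapping for the value-file format documented
// above convertValues().
//
//   ./convert_and_benchmark \
//       --net=predict_net.pb --init_net=init_net.pb \
//       --backend=builtin --warmup=2 --iter=10 \
//       --input_image_files="data,images.txt" \
//       --batch_size=2 --scale=256,-1 --crop=224,224 \
//       --preprocess=normalize,mean,std,bgrtorgb \
//       --report_time="json|CONVERT" \
//       --output="*" --output_folder=/tmp --text_output=true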