Mirror of https://github.com/zebrajr/tensorflow.git

Commit 9d12c629c0 (parent 134138299e): Refactor the document and some polishing.

PiperOrigin-RevId: 158083952
# tfprof: TensorFlow Profiling Tool

Author: Xin Pan (xpan@google.com, github: panyx0718), Jon Shlens, Yao Zhang

Consultants: Jon Shlens, Pete Warden

### Features

* Profile model architectures
  * parameters, tensor shapes, float operations, device placement, etc.
* Profile model performance
  * execution time, memory consumption
* Profile multiple steps
* Inspect checkpoint tensors' shapes and their values
* Selectively group, filter, account and order ops

### Interfaces

* Python API
* Command Line
* Visualization
* C++ API (Not public, contact us if needed.)
### Views and Options

tfprof provides 4 different views to organize the statistics.

* code view: operations are grouped by the Python code that generates them.
* op view: operations are grouped by operation type (e.g. MatMul, Conv2D).
* scope view: operations are organized based on name scope hierarchies.
* graph view: operations are organized based on their inputs/outputs.

For each view, there are 3 ways to display outputs:

* stdout: results are written to stdout.
* timeline: results are visualized in a Chrome browser as a time series.
* file: results are dumped to a file.

tfprof provides options to help users select, filter and order statistics.
See [Options](g3doc/options.md) for detailed instructions.

```
-max_depth 10
-min_bytes 0
-min_micros 0
-min_params 0
-min_float_ops 0
-min_occurrence 0
-step -1
-order_by name
-account_type_regexes .*
-start_name_regexes .*
-trim_name_regexes
-show_name_regexes .*
-hide_name_regexes
-account_displayed_op_only false
-select params
-output stdout:
```
### Tutorials

[Demo](#demo)

[Python API Tutorials](#python-api-tutorials): how to use tfprof directly from
Python code.

[CLI Tutorials](#cli-tutorials): how to run tfprof from an interactive command line.

[Options](#options): tfprof supports many options to selectively account/display/order
ops and statistics.

Detailed guides:

* [Python API](g3doc/python_api.md)
* [Command Line Interface](g3doc/command_line.md)
* [Profile Time](g3doc/profile_time.md)
* [Profile Memory](g3doc/profile_memory.md)
* [Profile Model Architecture](g3doc/profile_model_architecture.md)
* [Options](g3doc/options.md)
## Demo

### Attribute TensorFlow graph running time to your Python codes.

```shell
tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
_TFProfRoot (0us/22.44ms)
...
```

<left>
![CodeTimeline](g3doc/graph_timeline.png)
</left>
## Python API Tutorials

tfprof is part of TensorFlow core. Simply ```import tensorflow as tf```.
### Examine the shapes and sizes of all trainable Variables.
```python
import sys

# Print trainable variable parameter statistics to stdout.
# By default, statistics are associated with each graph node.
param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)

# Set tfprof_cmd='code' to associate statistics with Python codes.
# Copy the options dict so the module-level default is not mutated.
opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
opts['show_name_regexes'] = ['.*my_code1.py.*', '.*my_code2.py.*']
param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    tfprof_cmd='code',
    tfprof_options=opts)

# param_stats is a tensorflow.tfprof.TFGraphNodeProto proto.
# Let's print the root below.
sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
```
### Examine the number of floating point operations
```python
# Print to stdout an analysis of the number of floating point operations in
# the model, broken down by individual operations.
#
# Note: Only ops with RegisterStatistics('flops') defined have flop stats. It
# also requires complete shape information. It is common that shapes are
# unknown statically. To complete the shapes, provide run-time shape
# information with tf.RunMetadata to the API (see the next example on how to
# provide RunMetadata).
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
```
### Examine the timing and memory usage

You will first need to run the following set-up in your model in order to
compute the memory and timing statistics.

```python
# Generate the meta information for the model that contains the memory usage
# and timing information.
#
# Note: When run on GPU, a kernel is first scheduled (enqueued) and then
# executed asynchronously. tfprof only tracks the execution time.
# In addition, a substantial amount of time might be spent between Python and
# the TensorFlow runtime, which is also not tracked by tfprof.
run_metadata = tf.RunMetadata()
with tf.Session() as sess:
  _ = sess.run(train_op,
               options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
               run_metadata=run_metadata)
```
Finally, you may run `print_model_analysis` to explore the timing and memory
demands of the model.

```python
# See model_analyzer_test.py for more examples.
#
# Print to stdout an analysis of the memory usage and the timing information
# broken down by Python codes.
opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
opts['show_name_regexes'] = ['.*my_code.py.*']
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    run_meta=run_metadata,
    tfprof_cmd='code',
    tfprof_options=opts)

# Print to stdout an analysis of the memory usage and the timing information
# broken down by operations.
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    run_meta=run_metadata,
    tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
```
### Visualize

Set `opts['output'] = 'timeline:outfile=<filename>'` to generate a timeline
json file. Open a Chrome browser, go to the URL chrome://tracing, and load
the json file. Below are 2 examples of graph view and scope view. See a code
view example in the later examples.
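
As a minimal sketch of the same flow in the Python API (assuming `run_metadata`
was collected with FULL_TRACE as in the timing example above; the output path
is an arbitrary choice):

```python
# Write the profile as a Chrome-trace timeline file instead of stdout.
opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
opts['output'] = 'timeline:outfile=/tmp/profile.json'  # arbitrary path
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    run_meta=run_metadata,
    tfprof_options=opts)
# Then open chrome://tracing in Chrome and load /tmp/profile.json.
```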

<left>
![Timeline](g3doc/graph_timeline.png)
![Timeline](g3doc/scope_timeline.png)
</left>

## CLI Tutorials

Tutorials below are based on a 32-layer ResNet.

TODO(xpan): Provide graph.pbtxt, model.ckpt, tfprof_log and run_meta download.

### Examples

1) Start the `tfprof` command line tool
```shell
# Build the tool.
bazel build --config opt tensorflow/tools/tfprof/...

# Print help information, including detailed 'option' instructions.
bazel-bin/tensorflow/tools/tfprof/tfprof help
#
# The following commands will start tfprof interactive mode.
#
# Profile model shapes and parameters only.
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=graph.pbtxt
#
# Additionally profile ops' requested memory and timing.
# See the CLI Input Files section on generating the run_meta file.
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=graph.pbtxt \
    --run_meta_path=run_meta
#
# Additionally profile checkpoint statistics and values.
# Use '-account_type_regexes _checkpoint_variables' to select
# checkpoint tensors.
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=graph.pbtxt \
    --run_meta_path=run_meta \
    --checkpoint_path=model.ckpt
#
# tfprof_log is used to define customized op types, float ops and code traces.
# Use tfprof_logger.write_op_log() to create tfprof_log.
# See 12) in the Examples section on generating the tfprof_log file.
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=graph.pbtxt \
    --run_meta_path=run_meta \
    --op_log_path=tfprof_log \
    --checkpoint_path=model.ckpt
#
# The following command starts tfprof in one-shot mode.
bazel-bin/tensorflow/tools/tfprof/tfprof scope \
    --graph_path=graph.pbtxt \
    --max_depth=3
```

Note that `graph.pbtxt` is in ASCII text format.
2) Press enter to show the default options

```shell
tfprof>
-max_depth 4
-min_bytes 0
-min_micros 0
-min_params 0
-min_float_ops 0
-min_occurrence 0
-step -1
-order_by name
-account_type_regexes Variable,VariableV2
-start_name_regexes .*
-trim_name_regexes
-show_name_regexes .*
-hide_name_regexes IsVariableInitialized_[0-9]+,save\/.*,^zeros[0-9_]*
-account_displayed_op_only false
# Supported select fields. Availability depends on --[run_meta|checkpoint|op_log]_path.
# [bytes|micros|params|float_ops|occurrence|tensor_value|device|op_types]
-select params
# format: output_type:key=value,key=value...
# output_types: stdout (default), timeline, file.
# key=value pairs:
#   1. timeline: outfile=<filename>
#   2. file: outfile=<filename>
#   3. stdout: None.
# E.g. timeline:outfile=/tmp/timeline.json
-output
```
3) I want to see which line of my Python code costs the most time!

```shell
# Requires --graph_path --op_log_path
tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
_TFProfRoot (0us/22.44ms)
  model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms)
    model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms)
      model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms)
        model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms)
          model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms)
          model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us)
          model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us)
            model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us)
            model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us)
            ...
          model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us)
          model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us)
        model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
```

Set ```-output timeline:outfile=<filename>``` to generate a timeline instead of stdout.
<left>
![CodeTimeline](g3doc/code_timeline.png)
</left>

4) I want to see the `BatchNorm`'s gamma value in the checkpoint.

```shell
# Requires --graph_path, --checkpoint_path.
tfprof> scope -show_name_regexes unit_1_0.*gamma -select tensor_value -max_depth 5
_TFProfRoot ()
  unit_1_0/shared_activation/init_bn/gamma ()
[1.80 2.10 2.06 1.91 2.26 1.86 1.81 1.37 1.78 1.85 1.96 1.54 2.04 2.34 2.22 1.99 ],
  unit_1_0/sub2/bn2/gamma ()
[1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ],
```
5) I want to see my checkpoint tensors' shapes and numbers of parameters.

```shell
# Requires --graph_path, --checkpoint_path.
# Increase -max_depth to see all tensors.
tfprof> scope -account_type_regexes _checkpoint_variables -select params -max_depth 4
_TFProfRoot (--/930.58k params)
  global_step (0/0 params)
  init/init_conv/DW (3x3x3x16, 432/864 params)
  pool_logit/DW (64x10, 640/1.28k params)
    pool_logit/DW/Momentum (64x10, 640/640 params)
  pool_logit/biases (10, 10/20 params)
    pool_logit/biases/Momentum (10, 10/10 params)
  unit_last/final_bn/beta (64, 64/128 params)
  unit_last/final_bn/gamma (64, 64/128 params)
  unit_last/final_bn/moving_mean (64, 64/64 params)
  unit_last/final_bn/moving_variance (64, 64/64 params)
```
6) I defined an op named `cost` to calculate the loss. I want to know which ops
it depends on take a long time to run. Hint: Use the `graph` command to explore
graph dependencies.

```shell
# Requires --graph_path, --run_meta_path.
tfprof> graph -start_name_regexes cost.* -max_depth 100 -min_micros 10000 -select micros -account_type_regexes .*
_TFProfRoot (0us/3.61sec)
  init/init_conv/Conv2D (11.75ms/3.10sec)
  random_shuffle_queue_DequeueMany (3.09sec/3.09sec)
  unit_1_0/sub2/conv2/Conv2D (74.14ms/3.19sec)
  unit_1_3/sub2/conv2/Conv2D (60.75ms/3.34sec)
  unit_2_4/sub2/conv2/Conv2D (73.58ms/3.54sec)
  unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec)
```
7) I want to know the expensive operations during the back propagation.
Hint: TensorFlow prepends `gradients` to your defined name scopes. Use the
`scope` command to explore based on name scope hierarchies.

```shell
# Requires --graph_path, --run_meta_path.
tfprof> scope -start_name_regexes gradient.* -max_depth 100 -min_micros 20000 -select micros -account_type_regexes .*
_TFProfRoot (0us/2.29sec)
  gradients/unit_1_0/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (54.96ms/54.96ms)
  gradients/unit_1_0/sub2/conv2/Conv2D_grad/Conv2DBackpropFilter (83.63ms/83.63ms)
  gradients/unit_1_1/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (99.25ms/99.25ms)
  gradients/unit_1_2/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (95.40ms/95.40ms)
  gradients/unit_1_2/sub2/conv2/Conv2D_grad/Conv2DBackpropFilter (99.83ms/99.83ms)
  gradients/unit_1_3/sub1/conv1/Conv2D_grad/Conv2DBackpropFilter (95.39ms/95.39ms)
  ...
```
8) Show the number of float operations in the model.
Note: the float operations calculation depends on
1) op.RegisterStatistics. If an op doesn't have RegisterStatistics defined,
its float operations cannot be counted;
2) fully defined shapes, which are also necessary in order to calculate flops.
Sometimes the full shape is not available statically; use RunMetadata to get
the run-time shape.
The float operations number is provided by tensorflow::tfprof::OpLog, logged
from the Python API.

```shell
# Requires --graph_path, --op_log_path.
tfprof> scope -min_float_ops 1 -max_depth 10 -select float_ops -account_type_regexes .*
_TFProfRoot (0/17.63b flops)
  gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul (163.84k/163.84k flops)
  gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul_1 (163.84k/163.84k flops)
  init/init_conv/Conv2D (113.25m/113.25m flops)
  pool_logit/xw_plus_b (1.28k/165.12k flops)
    pool_logit/xw_plus_b/MatMul (163.84k/163.84k flops)
  unit_1_0/sub1/conv1/Conv2D (603.98m/603.98m flops)
  unit_1_0/sub2/conv2/Conv2D (603.98m/603.98m flops)
  unit_1_1/sub1/conv1/Conv2D (603.98m/603.98m flops)
  unit_1_1/sub2/conv2/Conv2D (603.98m/603.98m flops)
  ...
```
9) Show the number of parameters of all `tf.trainable_variables()` in the model.

```shell
# Requires --graph_path --op_log_path.
# Store the option for future commands.
tfprof> set -account_type_regexes _trainable_variables
tfprof> scope -max_depth 4 -select params
_TFProfRoot (--/464.15k params)
  init/init_conv/DW (3x3x3x16, 432/432 params)
  pool_logit/DW (64x10, 640/640 params)
  pool_logit/biases (10, 10/10 params)
  unit_last/final_bn/beta (64, 64/64 params)
  unit_last/final_bn/gamma (64, 64/64 params)
```

Where does `_trainable_variables` come from? It is from the OpLog file
generated by the write_op_log() Python API. write_op_log() helps users create
some common op types implicitly. Users can also define their own op types and
log them through the write_op_log() API.
10) What if I'm lazy and don't want to define op types? I have given my ops
well-defined names in my model's code, and want to use the names to select a
group of ops. Let's try it!

```shell
tfprof> set -account_type_regexes .*
tfprof> scope -show_name_regexes unit_2_1.*DW -max_depth 100 -account_displayed_op_only
_TFProfRoot (0/18.43k params)
  unit_2_1/sub1/conv1/DW (3x3x32x32, 9.22k/9.22k params)
  unit_2_1/sub2/conv2/DW (3x3x32x32, 9.22k/9.22k params)
```

The above command allows you to filter ops that match specific names.
`-account_displayed_op_only` asks tfprof to only account ops displayed
in the terminal. Otherwise, tfprof accounts all ops matched by
`-account_type_regexes` recursively, even if they are hidden due to some
options such as -max_depth.
11) TensorFlow has built-in op types. For example, the built-in op type
`Variable` seems to include the `Variable`s created by your model. However, be
careful when depending on it, because TensorFlow creates extra `Variable` ops
implicitly, and the implicitly created ops can have the same prefix as the
`Variable`s you defined.

In the following example, extra `Variables` are created and "/Momentum" is
appended to their names. This might cause your "model capacity" calculation
to be wrong.

```shell
tfprof> scope -account_type_regexes VariableV2 -max_depth 4 -select params
_TFProfRoot (--/930.58k params)
  global_step (1/1 params)
  init/init_conv/DW (3x3x3x16, 432/864 params)
  pool_logit/DW (64x10, 640/1.28k params)
    pool_logit/DW/Momentum (64x10, 640/640 params)
  pool_logit/biases (10, 10/20 params)
    pool_logit/biases/Momentum (10, 10/10 params)
  unit_last/final_bn/beta (64, 64/128 params)
  unit_last/final_bn/gamma (64, 64/128 params)
  unit_last/final_bn/moving_mean (64, 64/64 params)
  unit_last/final_bn/moving_variance (64, 64/64 params)
```
12) An example of defining an extra op type for ops using `OpLog`

First, in Python code, create an `OpLog` proto and add op type
information to it:

```python
# Assuming the OpLog proto's generated module lives here:
from tensorflow.tools.tfprof import tfprof_log_pb2

op_log = tfprof_log_pb2.OpLog()
entry = op_log.log_entries.add()
entry.name = 'pool_logit/DW'
entry.types.append('pool_logit')
entry = op_log.log_entries.add()
entry.name = 'pool_logit/biases'
# Alternatively:
# var = tf.get_variable(xxx)
# entry.name = var.op.name
entry.types.append('pool_logit')
```

Second, call write_op_log to write the OpLog proto.

```python
tf.contrib.tfprof.tfprof_logger.write_op_log(
    sess.graph, '/tmp/my_op_log_dir', op_log)

# Get run-time shape information in order to fill shapes and get flops.
tf.contrib.tfprof.tfprof_logger.write_op_log(
    sess.graph, '/tmp/my_op_log_dir', op_log, run_meta)
```

Third, when starting the tfprof tool, specify
"--op_log_path /tmp/my_op_log_dir/op_log"

```shell
tfprof> scope -account_type_regexes pool_logit -max_depth 4 -select params
_TFProfRoot (--/650 params)
  pool_logit/DW (64x10, 640/640 params)
  pool_logit/biases (10, 10/10 params)
```

Note that when you call
`tf.contrib.tfprof.tfprof_logger.write_op_log(...)`,
the tool adds all `Variables` inside `tf.trainable_variables()` to
`_trainable_variables`.
13) Run tfprof in one-shot mode and dump the result to a file.

```shell
# By default, output goes to stdout. Use the -output option to change the
# output type.
tfprof scope --graph_path=graph.pbtxt \
    --max_depth=3 \
    --output="file:outfile=/tmp/dump"
Reading Files...
Parsing GraphDef...
Preparing Views...

cat /tmp/dump
_TFProfRoot (--/930.58k params)
  global_step (0/0 params)
  pool_logit/DW (64x10, 640/1.28k params)
  pool_logit/biases (10, 10/20 params)
```
14) Analyze how balanced the Variables are across parameter servers.

In this tutorial, I'm going to use a seq2seq model, which is split
across several GPUs on workers and several parameter servers.

In tfprof, 'device' is an op_type. For example, if op1 and op2 are placed on
gpu0, they share an op_type called 'gpu0'.

```shell
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path ~/tfprof/textsum/graph.pbtxt \
    --run_meta_path ~/tfprof/textsum/run_meta

# Looks like ps task 1 is holding twice as many parameters as task 0.
tfprof> scope -select device,params -account_type_regexes .*ps.*task:0.* -max_depth 1
_TFProfRoot (--/25.81m params)
tfprof> scope -select device,params -account_type_regexes .*ps.*task:1.* -max_depth 1
_TFProfRoot (--/58.84m params)
```
### CLI Input Files

The tfprof command line interface (CLI) loads dumped files from a TensorFlow
model and converts them into in-memory data structures. To use it, users need
to specify the locations of the dumped files. The following are the dumped
files loaded by tfprof:

<b>--graph_path:</b> GraphDef text file (required). Used to build the
in-memory representation of the model. For example, graph.pbtxt written by
tf.Supervisor is a candidate. If you are not using tf.Supervisor, you can
easily get a GraphDef using tf.Graph.as_graph_def() or other APIs.
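
For instance, a minimal sketch of dumping the default graph (the directory and
file name are arbitrary choices; `tf.train.write_graph` writes the GraphDef as
ASCII text when `as_text=True`):

```python
# add_shapes=True attaches output-shape hints, which help shape-dependent
# statistics such as flops.
tf.train.write_graph(
    tf.get_default_graph().as_graph_def(add_shapes=True),
    '/tmp/tfprof', 'graph.pbtxt', as_text=True)
```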

<b>--run_meta_path:</b> tensorflow::RunMetadata.
Used to get the memory and time consumption of each op of the model. Users
need to enable it. For example, the following code snippet writes a
RunMetadata file:

```python
import os

from tensorflow.core.protobuf import config_pb2
from tensorflow.python.platform import gfile

run_options = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
run_metadata = config_pb2.RunMetadata()
# Once in a while, run with the options above to collect the RunMetadata.
_ = self._sess.run(..., options=run_options, run_metadata=run_metadata)
with gfile.Open(os.path.join(output_dir, "run_meta"), "w") as f:
  f.write(run_metadata.SerializeToString())
```
<b>--op_log_path:</b>
tensorflow::tfprof::OpLog. A proto used to provide extra op information.
By giving a group of ops a type name, users can easily aggregate the
statistics for those ops without accidentally missing or including extra ops.
tfprof exposes the following Python API to add op information and logging.

```python
tf.contrib.tfprof.tfprof_logger.write_op_log(graph, log_dir, op_log=None)
```

<b>--checkpoint_path:</b>
TensorFlow checkpoint. It defines the _checkpoint_variables op type. It also
provides checkpointed tensors' values.
## Options

`-max_depth`: Show ops that are at most this number of hops from the starting op in the tree/graph structure.

`-min_bytes`: Show ops that request at least this number of bytes.

`-min_micros`: Show ops that spend at least this number of microseconds to run.

`-min_params`: Show ops that contain at least this number of parameters.

`-min_float_ops`: Show ops that contain at least this number of float operations. Only available if an op has op.RegisterStatistics() defined and an OpLog is provided.

`-min_occurrence`: Show ops that appear at least this number of times. Only available in "op" view.

`-step`: Show the stats of this step when multiple steps of RunMetadata were added. By default, show the average of all steps.

`-order_by`: Order the results by [name|depth|bytes|micros|params|float_ops|occurrence].

`-account_type_regexes`: Account and display the ops whose types match one of the type regexes specified. tfprof allows users to define extra op types for ops through the tensorflow.tfprof.OpLog proto. Regexes are comma-separated.

`-start_name_regexes`: Show ops starting from the ops that match the regexes, recursively. Regexes are comma-separated.

`-trim_name_regexes`: Hide ops starting from the ops that match the regexes, recursively. Regexes are comma-separated.

`-show_name_regexes`: Show ops that match the regexes. Regexes are comma-separated.

`-hide_name_regexes`: Hide ops that match the regexes. Regexes are comma-separated.

Notes: For each op, `-account_type_regexes` is evaluated first; only ops with
types matching the specified regexes are accounted and selected for display.
`-start/trim/show/hide_name_regexes` are used to further filter ops for display.
`-start_name_regexes` is evaluated first to search for the starting ops to display.
Descendants of starting ops are then evaluated against `-show/hide_name_regexes`
to make the display decision. If an op matches trim_name_regexes, all its
descendants are hidden. Op statistics are *accounted even if they are hidden*,
as long as they match the `-account_xxx` options.

`-account_displayed_op_only`: If True, only account the statistics of ops eventually displayed. If False, account all op statistics matching -account_type_regexes recursively.

`-select`: Comma-separated list of metrics to show: [bytes|micros|params|float_ops|occurrence|tensor_value|device|op_types].

`-output`: Output results to stdout, a file or a timeline.
The format is ```output_type:key=value,key=value```.
For example: ```timeline:outfile=<filename>```.

* timeline: key=outfile, value=<filename>.
* stdout: none.
* file: key=outfile, value=<filename>.

### Teams

* Xin Pan (xpan@google.com, github: panyx0718)
* Jon Shlens
* Yao Zhang
---

<b>New file: `tensorflow/tools/tfprof/g3doc/command_line.md` (316 lines)</b>
## Command Line Interface Tutorials

* [Command Line Inputs](#command-line-inputs)
* [Start `tfprof`](#start-tfprof)
* [Examples](#examples)
  * [Profile Python Time](#profile-python-time)
  * [Profile Graph Time](#profile-graph-time)
  * [Profile Checkpoint Value](#profile-checkpoint-value)
  * [Profile Model Parameter](#profile-model-parameter)
  * [Profile Device Placement](#profile-device-placement)
  * [Define Customized Operation Type](#define-customized-operation-type)
  * [Non-interactive Mode](#non-interactive-mode)
### Command Line Inputs

The tfprof command line tool uses the following inputs:

<b>--graph_path:</b> GraphDef text file (required). Used to build the
in-memory architecture of the model. For example, graph.pbtxt written by
tf.Supervisor can be passed to --graph_path. You can also easily get a
GraphDef using tf.get_default_graph().as_graph_def(add_shapes=True) or other
APIs.

<b>--run_meta_path:</b> tensorflow::RunMetadata (optional).
Used to get the memory consumption and execution time of each op of the model.

The following code snippet writes a RunMetadata file:

```python
import os

from tensorflow.core.protobuf import config_pb2

run_options = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
run_metadata = config_pb2.RunMetadata()
_ = self._sess.run(..., options=run_options, run_metadata=run_metadata)
with tf.gfile.Open(os.path.join(output_dir, "run_meta"), "w") as f:
  f.write(run_metadata.SerializeToString())
```
<b>--op_log_path:</b>
tensorflow::tfprof::OpLog (optional). A proto used to provide extra operation
information: 1) float operations; 2) code traces; 3) customized operation
types for the -account_type_regexes option.

The following code snippet writes an OpLog file.

```python
tf.contrib.tfprof.tfprof_logger.write_op_log(graph, log_dir, op_log=None)
```

<b>--checkpoint_path:</b> TensorFlow checkpoint (optional).
It defines the _checkpoint_variables op type. It also provides checkpointed
tensors' values.
### Start `tfprof`

#### Build `tfprof`

```shell
# Build the tool.
bazel build --config opt tensorflow/tools/tfprof/...

# Print help information, including detailed 'option' instructions.
bazel-bin/tensorflow/tools/tfprof/tfprof help
```

#### Start `tfprof` Interactive Mode
```shell
# The following commands will start tfprof interactive mode.
#
# --graph_path contains the model architecture and tensor shapes.
# --run_meta_path contains the memory and time information.
# --op_log_path contains float operations and code traces.
# --checkpoint_path contains the model checkpoint data.
#
# Only includes model architecture, parameters and shapes.
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=graph.pbtxt
#
# Additionally profile ops' memory and timing.
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=graph.pbtxt \
    --run_meta_path=run_meta
#
# tfprof_log is used to define customized op types, float ops and code traces.
# Use tfprof_logger.write_op_log() to create tfprof_log.
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=graph.pbtxt \
    --run_meta_path=run_meta \
    --op_log_path=tfprof_log
#
# Additionally profile checkpoint statistics and values.
# Use '-account_type_regexes _checkpoint_variables' to select
# checkpoint tensors.
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=graph.pbtxt \
    --run_meta_path=run_meta \
    --op_log_path=tfprof_log \
    --checkpoint_path=model.ckpt
```

#### Start `tfprof` Non-interactive Mode

```shell
# Run tfprof in one-shot mode.
bazel-bin/tensorflow/tools/tfprof/tfprof scope \
    --graph_path=graph.pbtxt \
    --max_depth=3
```
#### Press enter to show the default options

Refer to [Options](options.md) for option instructions.

```shell
tfprof>
-max_depth 4
-min_bytes 0
-min_micros 0
-min_params 0
-min_float_ops 0
-min_occurrence 0
-step -1
-order_by name
-account_type_regexes Variable,VariableV2
-start_name_regexes .*
-trim_name_regexes
-show_name_regexes .*
-hide_name_regexes IsVariableInitialized_[0-9]+,save\/.*,^zeros[0-9_]*
-account_displayed_op_only false
# Supported select fields. Availability depends on --[run_meta|checkpoint|op_log]_path.
# [bytes|micros|params|float_ops|occurrence|tensor_value|device|op_types]
-select params
# format: output_type:key=value,key=value...
# output_types: stdout (default), timeline, file.
# key=value pairs:
#   1. timeline: outfile=<filename>
#   2. file: outfile=<filename>
#   3. stdout: None.
# E.g. timeline:outfile=/tmp/timeline.json
-output
```
### Examples

#### Profile Python Time
```shell
# Requires --graph_path --op_log_path
tfprof> code -max_depth 1000 -show_name_regexes .*model_analyzer.*py.* -select micros -account_type_regexes .* -order_by micros
_TFProfRoot (0us/22.44ms)
  model_analyzer_test.py:149:run_filename_as_m...:none (0us/22.44ms)
    model_analyzer_test.py:33:_run_code_in_main:none (0us/22.44ms)
      model_analyzer_test.py:208:<module>:test.main() (0us/22.44ms)
        model_analyzer_test.py:132:testComplexCodeView:x = lib.BuildFull... (0us/22.44ms)
          model_analyzer_testlib.py:63:BuildFullModel:return sgd_op.min... (0us/21.83ms)
          model_analyzer_testlib.py:58:BuildFullModel:cell, array_ops.c... (0us/333us)
          model_analyzer_testlib.py:54:BuildFullModel:seq.append(array_... (0us/254us)
            model_analyzer_testlib.py:42:BuildSmallModel:x = nn_ops.conv2d... (0us/134us)
            model_analyzer_testlib.py:46:BuildSmallModel:initializer=init_... (0us/40us)
            ...
          model_analyzer_testlib.py:61:BuildFullModel:loss = nn_ops.l2_... (0us/28us)
          model_analyzer_testlib.py:60:BuildFullModel:target = array_op... (0us/0us)
        model_analyzer_test.py:134:testComplexCodeView:sess.run(variable... (0us/0us)
```

Set ```-output timeline:outfile=<filename>``` to generate a timeline instead of stdout.
<left>
![CodeTimeline](code_timeline.png)
</left>
#### Profile Graph Time

```shell
# I defined an op named `cost` to calculate the loss. I want to know which
# ops it depends on take a long time to run.

# Requires --graph_path, --run_meta_path.
tfprof> graph -start_name_regexes cost.* -max_depth 100 -min_micros 10000 -select micros -account_type_regexes .*
_TFProfRoot (0us/3.61sec)
  init/init_conv/Conv2D (11.75ms/3.10sec)
  random_shuffle_queue_DequeueMany (3.09sec/3.09sec)
  unit_1_0/sub2/conv2/Conv2D (74.14ms/3.19sec)
  unit_1_3/sub2/conv2/Conv2D (60.75ms/3.34sec)
  unit_2_4/sub2/conv2/Conv2D (73.58ms/3.54sec)
  unit_3_3/sub2/conv2/Conv2D (10.26ms/3.60sec)
```
#### Profile Checkpoint Value
```shell
# Requires --graph_path, --checkpoint_path.
tfprof> scope -show_name_regexes unit_1_0.*gamma -select tensor_value -max_depth 5
_TFProfRoot ()
  unit_1_0/shared_activation/init_bn/gamma ()
[1.80 2.10 2.06 1.91 2.26 1.86 1.81 1.37 1.78 1.85 1.96 1.54 2.04 2.34 2.22 1.99 ],
  unit_1_0/sub2/bn2/gamma ()
[1.57 1.83 1.30 1.25 1.59 1.14 1.26 0.82 1.19 1.10 1.48 1.01 0.82 1.23 1.21 1.14 ],
```
#### Profile Model Parameter

```shell
# Show the number of parameters of all `tf.trainable_variables()` in the model.
# Requires --graph_path --op_log_path.
# Store the option for future commands.
tfprof> set -account_type_regexes _trainable_variables
tfprof> scope -max_depth 4 -select params
_TFProfRoot (--/464.15k params)
  init/init_conv/DW (3x3x3x16, 432/432 params)
  pool_logit/DW (64x10, 640/640 params)
  pool_logit/biases (10, 10/10 params)
  unit_last/final_bn/beta (64, 64/64 params)
  unit_last/final_bn/gamma (64, 64/64 params)
```

Where does `_trainable_variables` come from? It is a customized operation type
defined through the OpLog file; see
[Define Customized Operation Type](#define-customized-operation-type).

<b>The following example shows the importance of defining customized operation
types.</b> In this example, extra `Variables` are created by TensorFlow
implicitly and "/Momentum" is appended to their names. They shouldn't be
included in your "model capacity" calculation.
```shell
tfprof> scope -account_type_regexes VariableV2 -max_depth 4 -select params
_TFProfRoot (--/930.58k params)
  global_step (1/1 params)
  init/init_conv/DW (3x3x3x16, 432/864 params)
  pool_logit/DW (64x10, 640/1.28k params)
    pool_logit/DW/Momentum (64x10, 640/640 params)
  pool_logit/biases (10, 10/20 params)
    pool_logit/biases/Momentum (10, 10/10 params)
  unit_last/final_bn/beta (64, 64/128 params)
  unit_last/final_bn/gamma (64, 64/128 params)
  unit_last/final_bn/moving_mean (64, 64/64 params)
  unit_last/final_bn/moving_variance (64, 64/64 params)
```
#### Profile Device Placement

In this tutorial, a model is split across several GPUs on workers and several
parameter servers.

In tfprof, 'device' is an op_type. For example, if op1 and op2 are placed on
gpu:0, they share an operation type.

```shell
bazel-bin/tensorflow/tools/tfprof/tfprof \
    --graph_path=/tmp/graph.pbtxt \
    --run_meta_path=/tmp/run_meta

# Looks like ps task 1 is holding twice as many parameters as task 0.
tfprof> scope -select device,params -account_type_regexes .*ps.*task:0.* -max_depth 1
_TFProfRoot (--/25.81m params)
tfprof> scope -select device,params -account_type_regexes .*ps.*task:1.* -max_depth 1
_TFProfRoot (--/58.84m params)
```
#### Define Customized Operation Type

First, in Python code, create an `OpLog` proto and add op type
information to it:

```python
# Assuming the OpLog proto's generated module lives here:
from tensorflow.tools.tfprof import tfprof_log_pb2

op_log = tfprof_log_pb2.OpLog()
entry = op_log.log_entries.add()
entry.name = 'pool_logit/DW'
entry.types.append('pool_logit')
entry = op_log.log_entries.add()
entry.name = 'pool_logit/biases'
entry.types.append('pool_logit')
```

Second, call write_op_log to write the OpLog proto.

```python
tf.contrib.tfprof.tfprof_logger.write_op_log(
    sess.graph, '/tmp/my_op_log_dir', op_log)

# Get run-time shape information in order to fill shapes and get flops.
tf.contrib.tfprof.tfprof_logger.write_op_log(
    sess.graph, '/tmp/my_op_log_dir', op_log, run_meta)
```

Third, when starting the tfprof tool, specify
"--op_log_path /tmp/my_op_log_dir/op_log"

```shell
tfprof> scope -account_type_regexes pool_logit -max_depth 4 -select params
_TFProfRoot (--/650 params)
  pool_logit/DW (64x10, 640/640 params)
  pool_logit/biases (10, 10/10 params)
```

Note that `tf.contrib.tfprof.tfprof_logger.write_op_log(...)` automatically
assigns all `Variables` inside `tf.trainable_variables()` a customized
operation type: `_trainable_variables`.
#### Non-interactive Mode

Run tfprof in one-shot mode and dump the result to a file.

```shell
# By default, output goes to stdout. Use the -output option to change the
# output type.
tfprof scope --graph_path=graph.pbtxt \
    --max_depth=3 \
    --output="file:outfile=/tmp/dump"
Reading Files...
Parsing GraphDef...
Preparing Views...

cat /tmp/dump
_TFProfRoot (--/930.58k params)
  global_step (0/0 params)
  pool_logit/DW (64x10, 640/1.28k params)
  pool_logit/biases (10, 10/20 params)
```
---

<b>New file: `tensorflow/tools/tfprof/g3doc/options.md` (85 lines)</b>
## Options

### Overview

For all tfprof views, the statistics are processed with the following procedure:

1) An in-memory data structure is used to represent the view.

2) `-account_type_regexes` is used to first select the operations that match
the specified operation types. An operation has its default type
(e.g. MatMul, Conv2D). `tfprof` also considers device as an operation type.
Users can also define customized operation types. Hence, an operation can have
multiple types. Operations with matched types are selected for display and
their statistics are aggregated by the in-memory data structure.

3) Various `-xxx_name_regexes`, `-min_xxx`, `-max_depth` etc. options are then
applied to further filter based on names and values. The "name" is not limited
to the operation name: in code view, it's the code trace; in op view, it's the
operation type name. Different from `-account_type_regexes`, statistics are
used even if a name is not displayed. For example, in code view, a callee
might be hidden, but its statistics are still aggregated by its caller.
`-account_displayed_op_only`, however, breaks this rule and only uses the
statistics of displayed names.

4) Finally, the filtered data structure is displayed in a format depending
on the `-output` option.
#### Option Semantics In Different Views

Options usually have the same semantics in different views. However, some
can vary. For example, `-max_depth` in scope view means the depth of the
name scope <b>tree</b>. In op view, it means the length of the operation
<b>list</b>. In graph view, it means the number of hops in the <b>graph</b>.
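
A quick sketch of the difference (the depth values are arbitrary):

```shell
# At most 2 levels of the name-scope tree:
tfprof> scope -max_depth 2
# At most 10 entries in the operation-type list:
tfprof> op -max_depth 10
# Ops within 5 hops of the starting ops:
tfprof> graph -max_depth 5
```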

### Docs

`-max_depth`: Show ops that are at most this number of hops from the starting op in the tree/graph structure.

`-min_bytes`: Show ops that request at least this number of bytes.

`-min_micros`: Show ops that spend at least this number of microseconds to run.

`-min_params`: Show ops that contain at least this number of parameters.

`-min_float_ops`: Show ops that contain at least this number of float operations. Only available if an op has op.RegisterStatistics() defined and an OpLog is provided.

`-min_occurrence`: Show ops that appear at least this number of times. Only available in "op" view.

`-step`: Show the stats of this step when multiple steps of RunMetadata were added. By default, show the average of all steps.

`-order_by`: Order the results by [name|depth|bytes|micros|params|float_ops|occurrence].

`-account_type_regexes`: Account and display the ops whose types match one of the type regexes specified. tfprof allows users to define extra op types for ops through the tensorflow.tfprof.OpLog proto. Regexes are comma-separated.

`-start_name_regexes`: Show ops starting from the ops that match the regexes, recursively. Regexes are comma-separated.

`-trim_name_regexes`: Hide ops starting from the ops that match the regexes, recursively. Regexes are comma-separated.

`-show_name_regexes`: Show ops that match the regexes. Regexes are comma-separated.

`-hide_name_regexes`: Hide ops that match the regexes. Regexes are comma-separated.

Notes: For each op, `-account_type_regexes` is evaluated first; only ops with
types matching the specified regexes are accounted and selected for display.
`-start/trim/show/hide_name_regexes` are used to further filter ops for display.
`-start_name_regexes` is evaluated first to search for the starting ops to display.
Descendants of starting ops are then evaluated against `-show/hide_name_regexes`
to make the display decision. If an op matches trim_name_regexes, all its
descendants are hidden. Op statistics are *accounted even if they are hidden*,
as long as they match the `-account_xxx` options.
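
For example, a sketch combining these options (the regex values are
illustrative): account every op, start the display at the `cost` subtree, and
hide gradient ops. The hidden gradient ops are still accounted in the
displayed totals unless `-account_displayed_op_only` is set.

```shell
tfprof> scope -account_type_regexes .* -start_name_regexes cost.* -hide_name_regexes .*gradients.* -select micros
```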

`-account_displayed_op_only`: If True, only account the statistics of ops eventually displayed. If False, account all op statistics matching -account_type_regexes recursively.

`-select`: Comma-separated list of metrics to show: [bytes|micros|params|float_ops|occurrence|tensor_value|device|op_types].

`-output`: Output results to stdout, a file or a timeline.
The format is ```output_type:key=value,key=value```.
For example: ```-output timeline:outfile=<filename>```.

```shell
timeline: key=outfile, value=<filename>.
stdout: none.
file: key=outfile, value=<filename>.
```
---

<b>New file: `tensorflow/tools/tfprof/g3doc/profile_memory.md` (81 lines)</b>
## Profile Memory

It is generally a good idea to visualize the memory usage in a timeline.
It allows you to see the memory consumption of each GPU over time.

```shell
# To get memory information, you need --graph_path and --run_meta_path.
tfprof> graph -max_depth 10000000 -step 0 -account_type_regexes .* -output timeline:outfile=<filename>
generating trace file.

******************************************************
Timeline file is written to <filename>
Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
******************************************************
```

<left>
TODO(xpan): Show the image correctly in github.
![Timeline](graph_timeline.png)
</left>
```shell
# You can also visualize the memory information through other methods.

# With op view, it shows you the aggregated output tensor bytes of each
# operation type.
tfprof> op -select bytes -order_by bytes
node name | output bytes
Identity 32515.37MB (100.00%, 27.02%)
FusedBatchNormGrad 10802.14MB (72.98%, 8.98%)
FusedBatchNorm 10517.52MB (64.01%, 8.74%)
Conv2D 10509.25MB (55.27%, 8.73%)
Conv2DBackpropInput 9701.39MB (46.54%, 8.06%)
ReluGrad 9206.45MB (38.48%, 7.65%)
Relu 8462.80MB (30.83%, 7.03%)
DepthwiseConv2dNativeBackpropInput 7899.35MB (23.80%, 6.56%)
DepthwiseConv2dNative 7425.17MB (17.23%, 6.17%)
MaxPoolGrad 3015.44MB (11.06%, 2.51%)
AddN 2741.49MB (8.56%, 2.28%)

# With scope view, you can see the operations that output the largest tensors.
tfprof> scope -order_by bytes -select bytes -min_bytes 100000000
node name | output bytes
_TFProfRoot (--/120356.38MB)
  tower_3/SepConv2d_2b_3x3/separable_conv2d (346.85MB/854.00MB)
    tower_3/SepConv2d_2b_3x3/separable_conv2d/depthwise (507.15MB/507.15MB)
  tower_0/SepConv2d_2b_3x3/separable_conv2d (346.85MB/693.71MB)
    tower_0/SepConv2d_2b_3x3/separable_conv2d/depthwise (346.85MB/346.85MB)
  tower_2/SepConv2d_2b_3x3/separable_conv2d (346.85MB/693.71MB)
    tower_2/SepConv2d_2b_3x3/separable_conv2d/depthwise (346.85MB/346.85MB)
  tower_1/SepConv2d_2b_3x3/separable_conv2d (346.85MB/693.71MB)
    tower_1/SepConv2d_2b_3x3/separable_conv2d/depthwise (346.85MB/346.85MB)
  tower_3/SepConv2d_2a_3x3/separable_conv2d (346.85MB/520.28MB)
    tower_3/SepConv2d_2a_3x3/separable_conv2d/depthwise (173.43MB/173.43MB)
  tower_2/SepConv2d_2a_3x3/separable_conv2d (346.85MB/520.28MB)
    tower_2/SepConv2d_2a_3x3/separable_conv2d/depthwise (173.43MB/173.43MB)
  tower_0/SepConv2d_2a_3x3/separable_conv2d (346.85MB/520.28MB)
    tower_0/SepConv2d_2a_3x3/separable_conv2d/depthwise (173.43MB/173.43MB)
  ...

# Code view.
tfprof> code -max_depth 10 -select bytes -order_by bytes -start_name_regexes .*seq2seq.* -min_bytes 1
node name | output bytes
_TFProfRoot (--/74148.60MB)
  seq2seq_attention.py'>:168:run_filename_from...:none (0B/74148.60MB)
    seq2seq_attention.py'>:33:_run_code_in_main:none (0B/74148.60MB)
      seq2seq_attention.py:316:<module>:app.run() (0B/74148.60MB)
        app.py:432:run:_run_main(main or... (0B/74148.60MB)
          app.py:352:_run_main:sys.exit(main(arg... (0B/74148.60MB)
            seq2seq_attention.py:270:main:_Train(model, bat... (0B/74148.60MB)
              seq2seq_attention.py:128:_Train:model.build_graph() (0B/74148.60MB)
                seq2seq_attention_model.py:363:build_graph:self._add_train_o... (0B/48931.86MB)
                  seq2seq_attention_model.py:307:_add_train_op:tf.gradients(self... (0B/46761.06MB)
                  seq2seq_attention_model.py:322:_add_train_op:zip(grads, tvars)... (0B/2170.80MB)
                  seq2seq_attention_model.py:312:_add_train_op:tf.train.exponent... (0B/2.56KB)
                  seq2seq_attention_model.py:308:_add_train_op:tf.summary.scalar... (0B/64B)
                  seq2seq_attention_model.py:320:_add_train_op:tf.summary.scalar... (0B/64B)
                seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0B/25216.74MB)
                  seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a... (0B/21542.55MB)
```
---

<b>New file: `tensorflow/tools/tfprof/g3doc/profile_model_architecture.md` (87 lines)</b>
## Profile Model Architecture

* [Profile Model Parameters](#profile-model-parameters)
* [Profile Model Float Operations](#profile-model-float-operations)

### Profile Model Parameters

<b>Notes:</b>
The `VariableV2` operation type might contain variables created by TensorFlow
implicitly. Users normally don't want to count them as "model capacity".
We can use a customized operation type to select a subset of variables.
For example, `_trainable_variables` is created automatically by the tfprof
Python API. Users can also define customized operation types.

```shell
# Parameters are created by operation type 'VariableV2' (for older models,
# it's 'Variable'). Scope view is usually suitable in this case.
tfprof> scope -account_type_regexes VariableV2 -max_depth 4 -select params
_TFProfRoot (--/930.58k params)
  global_step (1/1 params)
  init/init_conv/DW (3x3x3x16, 432/864 params)
  pool_logit/DW (64x10, 640/1.28k params)
    pool_logit/DW/Momentum (64x10, 640/640 params)
  pool_logit/biases (10, 10/20 params)
    pool_logit/biases/Momentum (10, 10/10 params)
  unit_last/final_bn/beta (64, 64/128 params)
  unit_last/final_bn/gamma (64, 64/128 params)
  unit_last/final_bn/moving_mean (64, 64/64 params)
  unit_last/final_bn/moving_variance (64, 64/64 params)
```

```python
import sys

# The Python API profiles tf.trainable_variables() instead of VariableV2.
#
# By default, the result is printed to stdout. Users can update
# tfprof_options['output'] to write to a file. The result is always returned
# as a proto buffer.
param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
```
### Profile Model Float Operations

#### Caveats

For an operation to have float operation statistics:

* It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof
uses the definition to calculate float operations. Contributions are welcome.

* It must have known "shape" information for RegisterStatistics('flops')
to calculate the statistics. It is suggested to pass in `-run_meta_path` if
the shape is only known during runtime. tfprof can fill in the missing shape
with the runtime shape information from RunMetadata.

Hence, it is suggested to use the `-account_displayed_op_only`
option so that you know the statistics are only for the operations printed out.
```shell
# To profile float operations on the command line, you need to pass
# --graph_path and --op_log_path.
tfprof> scope -min_float_ops 1 -select float_ops -account_displayed_op_only
node name | # float_ops
_TFProfRoot (--/17.63b flops)
  gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul (163.84k/163.84k flops)
  gradients/pool_logit/xw_plus_b/MatMul_grad/MatMul_1 (163.84k/163.84k flops)
  init/init_conv/Conv2D (113.25m/113.25m flops)
  pool_logit/xw_plus_b (1.28k/165.12k flops)
    pool_logit/xw_plus_b/MatMul (163.84k/163.84k flops)
  unit_1_0/sub1/conv1/Conv2D (603.98m/603.98m flops)
  unit_1_0/sub2/conv2/Conv2D (603.98m/603.98m flops)
  unit_1_1/sub1/conv1/Conv2D (603.98m/603.98m flops)
  unit_1_1/sub2/conv2/Conv2D (603.98m/603.98m flops)

# Some might prefer op view, which aggregates by operation type.
tfprof> op -min_float_ops 1 -select float_ops -account_displayed_op_only -order_by float_ops
node name | # float_ops
Conv2D 17.63b float_ops (100.00%, 100.00%)
MatMul 491.52k float_ops (0.00%, 0.00%)
BiasAdd 1.28k float_ops (0.00%, 0.00%)
```

```python
# You can also do that with the Python API.
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
```
---

<b>New file: `tensorflow/tools/tfprof/g3doc/profile_time.md` (147 lines)</b>
## Profile Time

* [Profile by Python Code](#profile-by-python-code)
* [Profile by Operation Type](#profile-by-operation-type)
* [Profile by Graph](#profile-by-graph)
* [Profile by Name Scope](#profile-by-name-scope)
###Profile by Python Code
```python
# In code view, the time of each line of Python code is the aggregated
# time of all operations created by that line.
# On the command line, it requires --graph_path, --op_log_path and --run_meta_path.
# --op_log_path provides the code trace information.
# --run_meta_path provides the time information.

tfprof> code -show_name_regexes seq2seq_attention.* -max_depth 10 -select micros -order_by micros
node name | execution time
_TFProfRoot (--/3.74sec)
  seq2seq_attention.py:168:run_filename_from...:none (0us/3.74sec)
    seq2seq_attention.py:33:_run_code_in_main:none (0us/3.74sec)
      seq2seq_attention.py:316:<module>:app.run() (0us/3.74sec)
        seq2seq_attention.py:270:main:_Train(model, bat... (0us/3.74sec)
          seq2seq_attention.py:128:_Train:model.build_graph() (0us/3.74sec)
            seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0us/2.79sec)
              seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ... (0us/2.46sec)
              seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a... (0us/265.31ms)
              seq2seq_attention_model.py:253:_add_seq2seq:initial_state_att... (0us/50.35ms)
              seq2seq_attention_model.py:173:_add_seq2seq:for x in encoder_... (0us/8.72ms)
              seq2seq_attention_model.py:218:_add_seq2seq:w_t = tf.transpos... (0us/2.39ms)
              ...
            seq2seq_attention_model.py:363:build_graph:self._add_train_o... (0us/949.10ms)
              seq2seq_attention_model.py:307:_add_train_op:tf.gradients(self... (0us/641.44ms)
              seq2seq_attention_model.py:322:_add_train_op:zip(grads, tvars)... (0us/307.56ms)
              ...
            seq2seq_attention_model.py:364:build_graph:self._summaries =... (0us/13us)
            seq2seq_attention_model.py:361:build_graph:self.global_step ... (0us/12us)
            ...
          seq2seq_attention.py:129:_Train:saver = tf.train.... (0us/0us)
          seq2seq_attention.py:140:_Train:global_step=model... (0us/0us)

# Sometimes you want to explore a specific function. You can do that
# with -start_name_regexes.
tfprof> code -start_name_regexes .*_add_seq2seq.* -show_name_regexes seq2seq_attention.* -max_depth 10 -select micros -order_by micros
node name | execution time
_TFProfRoot (--/3.74sec)
  seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0us/2.79sec)
    seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ... (0us/2.46sec)
      seq2seq_attention_model.py:289:sampled_loss_func:num_classes=vsize) (0us/2.46sec)
      seq2seq_attention_model.py:282:sampled_loss_func:labels = tf.resha... (0us/164us)

# You can also dive deeper into TensorFlow's own libraries.
tfprof> code -max_depth 5 -select micros -order_by micros -start_name_regexes .*_add_seq2seq.* -min_micros 100000
_TFProfRoot (--/3.74sec)
  seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0us/2.79sec)
    seq2seq_attention_model.py:293:_add_seq2seq:decoder_outputs, ... (0us/2.46sec)
      seq2seq_lib.py:181:sampled_sequence_...:average_across_ti... (0us/2.46sec)
        seq2seq_lib.py:147:sequence_loss_by_...:crossent = loss_f... (0us/2.46sec)
    seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a... (0us/265.31ms)
      seq2seq_lib.py:104:bidirectional_rnn:sequence_length, ... (0us/127.27ms)
        core_rnn.py:195:static_rnn:state_size=cell.s... (0us/127.20ms)
      seq2seq_lib.py:110:bidirectional_rnn:initial_state_bw,... (0us/125.96ms)
        core_rnn.py:195:static_rnn:state_size=cell.s... (0us/125.86ms)

# It can also be done via the Python API.
opts = model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
opts['account_type_regexes'] = ['.*']
opts['show_name_regexes'] = ['.*model_analyzer_testlib.py.*']
opts['account_displayed_op_only'] = False
opts['select'] = ['micros']

tfprof_node = model_analyzer.print_model_analysis(
    sess.graph, run_meta, tfprof_cmd='code', tfprof_options=opts)
```

You can generate a timeline visualization in the code view:
Set ```-output timeline:outfile=<filename>``` to generate a timeline file instead of writing to stdout.

<left>
![CodeTimeline](g3doc/code_timeline.png)
</left>

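As a minimal sketch of how the three inputs above can be produced (the paths and `train_op` are placeholders for your own setup, and `write_op_log` is the contrib helper assumed here to dump the code traces):

```python
# Sketch: produce the three files the code view consumes.
import tensorflow as tf

run_metadata = tf.RunMetadata()
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  # Trace one step to collect timing for --run_meta_path.
  _ = sess.run(train_op,  # train_op: your model's training op (placeholder).
               options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
               run_metadata=run_metadata)
  # --graph_path: the GraphDef in text format.
  tf.train.write_graph(sess.graph_def, '/tmp/profile', 'graph.pbtxt')
  # --run_meta_path: the serialized RunMetadata.
  with open('/tmp/profile/run_meta', 'wb') as f:
    f.write(run_metadata.SerializeToString())
  # --op_log_path: code traces, written by the tfprof logger.
  tf.contrib.tfprof.tfprof_logger.write_op_log(
      sess.graph, '/tmp/profile', run_meta=run_metadata)
```
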
###Profile by Operation Type
```python
# In op view, you can view the aggregated time of each operation type.
tfprof> op -select micros,occurrence -order_by micros
node name | execution time | op occurrence
SoftmaxCrossEntropyWithLogits 1.37sec (100.00%, 36.44%), 30
MatMul 618.97ms (63.56%, 16.51%), 3450
Add 273.76ms (47.06%, 7.30%), 2180
Sub 215.41ms (39.76%, 5.74%), 4372
ConcatV2 203.88ms (34.01%, 5.44%), 6098
Mul 134.32ms (28.58%, 3.58%), 9427
ApplyAdam 92.66ms (25.00%, 2.47%), 27
Switch 72.43ms (22.53%, 1.93%), 30654
LogUniformCandidateSampler 69.01ms (20.59%, 1.84%), 30
Unique 53.50ms (18.75%, 1.43%), 2
AddN 50.10ms (17.33%, 1.34%), 5481

# You might be surprised to see that SoftmaxCrossEntropyWithLogits is
# that expensive. As shown below, it is placed on cpu.
tfprof> op -select micros,device -order_by micros
node name | execution time | assigned devices
SoftmaxCrossEntropyWithLogits 1.37sec (100.00%, 36.44%), /job:worker/replica:0/task:0/cpu:0
MatMul 618.97ms (63.56%, 16.51%), |/job:worker/replica:0/task:0/cpu:0|/job:worker/replica:0/task:0/gpu:0|/job:worker/replica:0/task:0/gpu:1|/job:worker/replica:0/task:0/gpu:2|/job:worker/replica:0/task:0/gpu:3
```

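The op view can also be queried from the Python API; a minimal sketch, assuming a `run_metadata` collected with `trace_level=FULL_TRACE` (see the Time and Memory setup in the Python API tutorial):

```python
# Sketch: op view with time and occurrence, from the Python API.
opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
opts['select'] = ['micros', 'occurrence']
opts['order_by'] = 'micros'
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    run_meta=run_metadata,
    tfprof_cmd='op',
    tfprof_options=opts)
```
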
###Profile by Graph

Usually, you use the graph view to generate a timeline to visualize the result.

In the chrome://tracing UI, click "Flow Event" in "View Options" at the upper
right corner to see the flow of tensors.

<left>
TODO(xpan): Show the image correctly in github.
![Timeline](g3doc/graph_timeline.png)
</left>

tfprof options allow users to generate the timeline in some advanced ways.

```python
# Only generate timeline for gpu3 and cpu on workers.
graph -max_depth 10000000 -step 0 -account_type_regexes .*gpu:3.*,.*worker.*cpu:0.* -output timeline:outfile=<filename.json>
generating trace file.

******************************************************
Timeline file is written to <filename.json>.
Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
******************************************************
```

###Profile by Name Scope

Usually the scope view lets you pinpoint the problematic places if you
have properly named your operations with tf.name_scope or tf.variable_scope.

```python
tfprof> scope -max_depth 30 -select micros -min_micros 100000 -order_by micros
node name | execution time
_TFProfRoot (--/8.12sec)
  tower_3/gradients/tower_3/Conv2d_1a_3x3/convolution_grad/Conv2DBackpropFilter (126.34ms/126.34ms)
  tower_1/gradients/tower_1/Conv2d_1a_3x3/convolution_grad/Conv2DBackpropFilter (125.44ms/125.44ms)
  tower_2/gradients/tower_2/Conv2d_1a_3x3/convolution_grad/Conv2DBackpropFilter (124.85ms/124.85ms)
  tower_0/gradients/tower_0/Conv2d_1a_3x3/convolution_grad/Conv2DBackpropFilter (124.45ms/124.45ms)
```

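A minimal sketch of such naming (the scope names and shapes here are illustrative), so that related operations group under one prefix in the scope view:

```python
# Sketch: wrap op creation in name scopes so the scope view can group them.
import tensorflow as tf

with tf.name_scope('tower_0'):
  with tf.name_scope('conv1'):
    x = tf.random_normal([8, 32, 32, 3])
    w = tf.Variable(tf.truncated_normal([3, 3, 3, 16], stddev=0.1))
    y = tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
# These ops now appear under tower_0/conv1/... in the scope view.
```
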

144 tensorflow/tools/tfprof/g3doc/python_api.md Normal file
@@ -0,0 +1,144 @@

## Python API Tutorials

* [Parameters and Shapes](#parameters-and-shapes)
* [Float Operations](#float-operations)
* [Time and Memory](#time-and-memory)
* [Visualize](#visualize)
* [Multi-step Profiling](#multi-step-profiling)

All examples below assume ```import tensorflow as tf```.

### Parameters and Shapes
```python
# Print trainable variable parameter statistics to stdout.
param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)

# Use the code view to associate statistics with Python code.
opts = tf.contrib.tfprof.model_analyzer.TRAINABLE_VARS_PARAMS_STAT_OPTIONS.copy()
opts['show_name_regexes'] = ['.*my_code1.py.*', '.*my_code2.py.*']
param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    tfprof_cmd='code',
    tfprof_options=opts)

# param_stats can be tensorflow.tfprof.TFGraphNodeProto or
# tensorflow.tfprof.TFMultiGraphNodeProto, depending on the view.
# Let's print the root below.
sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
```

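Beyond printing, the returned proto can be traversed programmatically. A sketch, assuming the scope-view `TFGraphNodeProto` above exposes `name`, `total_parameters` and a repeated `children` field:

```python
import sys

# Sketch: rank the root's direct children by parameter count.
def largest_scopes(node, k=5):
  ranked = sorted(node.children,
                  key=lambda c: c.total_parameters, reverse=True)
  return [(c.name, c.total_parameters) for c in ranked[:k]]

for name, params in largest_scopes(param_stats):
  sys.stdout.write('%s: %d params\n' % (name, params))
```
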
### Float Operations

#### Note: See [Caveats](profile_model_architecture.md#caveats) in the "Profile Model Architecture" tutorial.
```python
# Print to stdout an analysis of the number of floating point operations in the
# model, broken down by individual operations.
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)
```

### Time and Memory
You will first need to run the following setup in your model in order to
compute the memory and timing statistics.

```python
# Generate the RunMetadata that contains the memory and timing information.
#
# Note: When run on GPU, a kernel is first scheduled (enqueued) and then
#       executed asynchronously. tfprof only tracks the execution time.
#
run_metadata = tf.RunMetadata()
with tf.Session() as sess:
  _ = sess.run(train_op,
               options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
               run_metadata=run_metadata)
```

Finally, you may run `print_model_analysis` to explore the timing and memory
information of the model.

```python
# See model_analyzer_test.py for more examples.
#
# Print to stdout an analysis of the memory usage and the timing information,
# broken down by Python code.
opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
opts['show_name_regexes'] = ['.*my_code.py.*']
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    run_meta=run_metadata,
    tfprof_cmd='code',
    tfprof_options=opts)

# Print to stdout an analysis of the memory usage and the timing information,
# broken down by operations.
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    run_meta=run_metadata,
    tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
```

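The displayed statistics can also be thresholded before printing. A sketch that keeps only name scopes requesting at least 1MB, reusing the option keys listed at the top of this README:

```python
# Sketch: show only ops requesting >= 1MB, ordered by requested bytes.
opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
opts['min_bytes'] = 1024 * 1024  # 1MB threshold.
opts['select'] = ['bytes', 'micros']
opts['order_by'] = 'bytes'
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    run_meta=run_metadata,
    tfprof_cmd='scope',
    tfprof_options=opts)
```
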
### Visualize

```
To visualize the results of the Python API:
Set opts['output'] = 'timeline:outfile=<filename>' to generate a timeline json file.
Open a Chrome browser, open URL chrome://tracing, and load the json file.
```

Below are two examples of graph view and scope view. A code view example appears
in the earlier tutorials.

<left>
![CodeTimeline](g3doc/graph_timeline.png)
![CodeTimeline](g3doc/scope_timeline.png)
</left>

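Tying these steps together, a minimal sketch that writes a graph-view timeline from Python (the output path is a placeholder):

```python
# Sketch: generate a graph-view timeline json from the Python API.
opts = tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
opts['output'] = 'timeline:outfile=/tmp/timeline.json'
tf.contrib.tfprof.model_analyzer.print_model_analysis(
    tf.get_default_graph(),
    run_meta=run_metadata,
    tfprof_cmd='graph',
    tfprof_options=opts)
# Then open chrome://tracing in Chrome and load /tmp/timeline.json.
```
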
### Multi-step Profiling

tfprof allows you to profile statistics across multiple steps.

```python
opts = model_analyzer.PRINT_ALL_TIMING_MEMORY.copy()
opts['account_type_regexes'] = ['.*']

with session.Session() as sess:
  r1, r2, r3 = lib.BuildSplitableModel()
  sess.run(variables.global_variables_initializer())

  # Create a profiler.
  profiler = model_analyzer.Profiler(sess.graph)
  # Profile without RunMetadata of any step.
  pb0 = profiler.profile_name_scope(opts)

  run_meta = config_pb2.RunMetadata()
  _ = sess.run(r1,
               options=config_pb2.RunOptions(
                   trace_level=config_pb2.RunOptions.FULL_TRACE),
               run_metadata=run_meta)

  # Add run_meta of step 1.
  profiler.add_step(1, run_meta)
  pb1 = profiler.profile_name_scope(opts)

  run_meta2 = config_pb2.RunMetadata()
  _ = sess.run(r2,
               options=config_pb2.RunOptions(
                   trace_level=config_pb2.RunOptions.FULL_TRACE),
               run_metadata=run_meta2)
  # Add run_meta of step 2.
  profiler.add_step(2, run_meta2)
  pb2 = profiler.profile_name_scope(opts)

  run_meta3 = config_pb2.RunMetadata()
  _ = sess.run(r3,
               options=config_pb2.RunOptions(
                   trace_level=config_pb2.RunOptions.FULL_TRACE),
               run_metadata=run_meta3)
  # Add run_meta of step 3.
  profiler.add_step(3, run_meta3)
  pb3 = profiler.profile_name_scope(opts)
```

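Since the three steps above differ only in the fetch and the step number, they can be folded into a loop; a sketch under the same setup:

```python
# Sketch: the per-step pattern above, folded into a loop.
profiler = model_analyzer.Profiler(sess.graph)
for step, fetch in enumerate([r1, r2, r3], start=1):
  run_meta = config_pb2.RunMetadata()
  sess.run(fetch,
           options=config_pb2.RunOptions(
               trace_level=config_pb2.RunOptions.FULL_TRACE),
           run_metadata=run_meta)
  profiler.add_step(step, run_meta)
  # Profile with all RunMetadata collected so far.
  pb = profiler.profile_name_scope(opts)
```
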
@@ -219,9 +219,7 @@ std::vector<CodeNode*> TFCode::Account(const std::vector<CodeNode*>& roots,
    node->ResetTotalStats();
    std::vector<CodeNode*> act_cnodes = Account(node->children, opts);
    node->account = ReAccount(node, opts);
    // LOG(ERROR) << act_cnodes.size() << " " << node->account;
    if (node->account || !act_cnodes.empty()) {
      // LOG(ERROR) << node->name();
      node->show_children.clear();
      node->ResetTotalStats();
      node->AddSelfToTotalStats();

@@ -81,8 +81,7 @@ const ShowNode* TFGraph::ShowInternal(const Options& opts, Timeline* timeline) {
  // 1. Account and aggregate the stats based on the graph structure.
  //    Returns a graph that consists of accounted nodes.
  std::set<string> visits;
  std::vector<GraphNode*> roots =
      Account(root_->children, opts, timeline, &visits);
  std::vector<GraphNode*> roots = Account(root_->children, opts, &visits);
  for (GraphNode* n : roots) {
    root_->AggregateTotalStats(n);
  }

@@ -105,7 +104,7 @@ const ShowNode* TFGraph::ShowInternal(const Options& opts, Timeline* timeline) {
  Format(root->show_children, &root->formatted_str, root->mutable_proto());

  if (timeline) {
    timeline->GenerateGraphTimeline(root);
    timeline->GenerateGraphTimeline(root->show_children);
  }
  return root;
}

@@ -208,29 +207,21 @@ std::vector<GraphNode*> TFGraph::PrintGraph(const std::vector<GraphNode*> roots,

std::vector<GraphNode*> TFGraph::Account(const std::vector<GraphNode*>& roots,
                                         const Options& opts,
                                         Timeline* timeline,
                                         std::set<string>* visits) {
  std::vector<GraphNode*> act_nodes;
  for (GraphNode* node : roots) {
    if (visits->find(node->name()) != visits->end()) continue;
    visits->insert(node->name());
    // Depth-first.
    std::vector<GraphNode*> act_cnodes =
        Account(node->children, opts, timeline, visits);
    std::vector<GraphNode*> act_cnodes = Account(node->children, opts, visits);

    node->account = ReAccount(node, opts);
    if (node->account) {
      node->show_children.clear();
      node->ResetTotalStats();
      node->AddSelfToTotalStats();
      if (timeline) {
        timeline->TrackNode(node);
      }
      // Aggregate its accounted children stats.
      for (GraphNode* c : act_cnodes) {
        if (timeline) {
          timeline->TrackNodeConnection(node, c);
        }
        node->AggregateTotalStats(c);
        node->show_children.push_back(c);
      }

@@ -70,7 +70,7 @@ class TFGraph : public TFShow {
                                    int last_ident, std::set<string>* visits);

  std::vector<GraphNode*> Account(const std::vector<GraphNode*>& roots,
                                  const Options& opts, Timeline* timeline,
                                  const Options& opts,
                                  std::set<string>* visits);

  void Format(const std::vector<GraphNode*> roots, string* display_str,

@@ -21,12 +21,95 @@ limitations under the License.

namespace tensorflow {
namespace tfprof {
// Notes about start and end time from the NodeExecStats proto.
// Notes about start and end time from the NodeExecStats proto:
// For GPU, there is no difference between op_end_rel_micros and
// all_end_rel_micros. All are kernel times.
// For CPU, op_end_rel is the kernel time, while all_end_rel_micros includes
// some post-processing.
// Here, we only consider kernel time for simplicity.
// some post-processing. Besides, currently, there is no way to measure
// the execution time of async ops accurately.
//
// Notes about device:
// For ops on gpu:
//   It will appear in three different devices in RunMetadata: 1) gpu:x,
//   2) gpu:x:stream:all and 3) gpu:x:stream:id. 2) is used as a combined view
//   of all different 3). 1) is the op scheduling, pre-processing and
//   post-processing time. 3) is the execution time of GPU kernels on a stream.
// For ops on cpu:
//   It will only appear as cpu:0.

void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
  devices_.insert(dev);
  if (step_stat.all_start_micros() > 0) {
    if (all_start_micros_ > 0) {
      all_start_micros_ = std::min(
          all_start_micros_, static_cast<int64>(step_stat.all_start_micros()));
    } else {
      all_start_micros_ = step_stat.all_start_micros();
    }
    int64 op_end_rel_micros = step_stat.op_end_rel_micros();
    // Round quick execution to 1 micro to be semantically robust.
    if (op_end_rel_micros == 0) {
      ++op_end_rel_micros;
    }
    latest_end_rel_micros_ =
        std::max(latest_end_rel_micros_, op_end_rel_micros);

    op_execs_[dev].push_back(
        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros));

    if (dev.find("stream") != dev.npos && dev.find("stream:all") == dev.npos) {
      gpu_kernel_execs_[dev].push_back(
          std::make_pair(step_stat.all_start_micros(), op_end_rel_micros));
    }
  }
}

void ExecStep::AddMemoryStats(const string& dev,
                              const NodeExecStats& step_stat) {
  if (mem_initiated_) {
    return;
  }
  mem_initiated_ = true;

  for (const auto& mem : step_stat.memory()) {
    // TODO(xpan): Fix this hack. Currently the allocator name seems quite
    // ad-hoc.
    if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
      continue;
    }
    allocator_bytes_in_use_ =
        std::max(allocator_bytes_in_use_,
                 static_cast<int64>(mem.allocator_bytes_in_use()));
  }
  int64 total_output_bytes = 0;
  for (const auto& output : step_stat.output()) {
    if (output.has_tensor_description() &&
        output.tensor_description().has_allocation_description()) {
      // TODO(xpan): Maybe allocated_bytes.
      int64 output_bytes = std::max(output.tensor_description()
                                        .allocation_description()
                                        .allocated_bytes(),
                                    output.tensor_description()
                                        .allocation_description()
                                        .requested_bytes());
      uint64 output_ptr =
          output.tensor_description().allocation_description().ptr();
      total_output_bytes += output_bytes;
      output_bytes_[output.slot()] = std::make_pair(output_bytes, output_ptr);
    }
  }
  if (step_stat.has_memory_stats()) {
    host_temp_bytes_ += step_stat.memory_stats().host_temp_memory_size();
    host_persistent_bytes_ +=
        step_stat.memory_stats().host_persistent_memory_size();
    accelerator_temp_bytes_ +=
        step_stat.memory_stats().device_temp_memory_size();
    accelerator_persistent_bytes_ +=
        step_stat.memory_stats().device_persistent_memory_size();
  }
  requested_bytes_ = total_output_bytes;
}

void TFGraphNode::AddStepStat(int64 step, const string& device,
                              const NodeExecStats& step_stat) {
  string dev = str_util::Lowercase(device);

@@ -47,12 +130,48 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
    }
  }

  ExecStep& exec = execs_[step];
  exec.AddTimeStats(dev, step_stat);
  auto exec = execs_.find(step);
  if (exec == execs_.end()) {
    execs_.insert(std::pair<int64, ExecStep>(step, ExecStep(this)));
    exec = execs_.find(step);
  }

  exec->second.AddTimeStats(dev, step_stat);

  if (dev == canonical_device_) {
    exec.AddMemoryStats(dev, step_stat);
    exec->second.AddMemoryStats(dev, step_stat);
  }
}

int64 ExecStep::exec_micros() const {
  int64 total = 0;
  for (const auto& execs : gpu_kernel_execs_) {
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  if (total > 0) return total;

  // If there is no gpu kernel time, fall back to assuming it runs on cpu.
  // TODO(xpan): No way to track CPU async op timing accurately?
  if (op_execs_.size() > 1) {
    fprintf(stderr, "Op: %s has over 1 no-gpu assignment\n",
            node->name().c_str());
  }
  for (const auto& execs : op_execs_) {
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

bool IsCombinedGPUStream(const string& device) {
  return device.find("stream:all") != device.npos;
}

bool IsCPUDevice(const string& device) {
  return device.find("cpu:0") != device.npos;
}
}  // namespace tfprof
}  // namespace tensorflow

@@ -37,10 +37,13 @@ limitations under the License.
namespace tensorflow {
namespace tfprof {

class TFGraphNode;

class ExecStep {
 public:
  ExecStep()
      : all_start_micros_(0),
  ExecStep(TFGraphNode* node)
      : node(node),
        all_start_micros_(0),
        latest_end_rel_micros_(0),
        mem_initiated_(false),
        requested_bytes_(0),

@@ -50,99 +53,11 @@ class ExecStep {
        accelerator_persistent_bytes_(0),
        allocator_bytes_in_use_(0) {}

  void AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
    devices_.insert(dev);
    if (step_stat.all_start_micros() > 0) {
      if (all_start_micros_ > 0) {
        all_start_micros_ =
            std::min(all_start_micros_,
                     static_cast<int64>(step_stat.all_start_micros()));
      } else {
        all_start_micros_ = step_stat.all_start_micros();
      }
      int64 op_end_rel_micros = step_stat.op_end_rel_micros();
      // Round quick execution to 1 micro to be semantically robust.
      if (op_end_rel_micros == 0) {
        ++op_end_rel_micros;
      }
      latest_end_rel_micros_ =
          std::max(latest_end_rel_micros_, op_end_rel_micros);
  void AddTimeStats(const string& dev, const NodeExecStats& step_stat);

      op_execs_[dev].push_back(
          std::make_pair(step_stat.all_start_micros(), op_end_rel_micros));
  void AddMemoryStats(const string& dev, const NodeExecStats& step_stat);

      if (dev.find("stream") != dev.npos &&
          dev.find("stream:all") == dev.npos) {
        gpu_kernel_execs_[dev].push_back(
            std::make_pair(step_stat.all_start_micros(), op_end_rel_micros));
      }
    }
  }

  void AddMemoryStats(const string& dev, const NodeExecStats& step_stat) {
    if (mem_initiated_) {
      // fprintf(stderr, "Memory initiated twice on %s", dev.c_str());
      return;
    }
    mem_initiated_ = true;

    for (const auto& mem : step_stat.memory()) {
      // TODO(xpan): Fix this hack. Currently the allocator name seems quite
      // ad-hoc.
      if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
        continue;
      }
      allocator_bytes_in_use_ =
          std::max(allocator_bytes_in_use_,
                   static_cast<int64>(mem.allocator_bytes_in_use()));
    }
    int64 total_output_bytes = 0;
    for (const auto& output : step_stat.output()) {
      if (output.has_tensor_description() &&
          output.tensor_description().has_allocation_description()) {
        // TODO(xpan): Maybe allocated_bytes.
        int64 output_bytes = std::max(output.tensor_description()
                                          .allocation_description()
                                          .allocated_bytes(),
                                      output.tensor_description()
                                          .allocation_description()
                                          .requested_bytes());
        uint64 output_ptr =
            output.tensor_description().allocation_description().ptr();
        total_output_bytes += output_bytes;
        output_bytes_[output.slot()] = std::make_pair(output_bytes, output_ptr);
      }
    }
    if (step_stat.has_memory_stats()) {
      host_temp_bytes_ += step_stat.memory_stats().host_temp_memory_size();
      host_persistent_bytes_ +=
          step_stat.memory_stats().host_persistent_memory_size();
      accelerator_temp_bytes_ +=
          step_stat.memory_stats().device_temp_memory_size();
      accelerator_persistent_bytes_ +=
          step_stat.memory_stats().device_persistent_memory_size();
    }
    requested_bytes_ = total_output_bytes;
  }

  int64 exec_micros() const {
    int64 total = 0;
    for (const auto& execs : gpu_kernel_execs_) {
      for (const auto& exec : execs.second) {
        total += exec.second;
      }
    }
    if (total > 0) return total;

    // If there is no gpu kernel time, fall back to assuming it runs on cpu.
    // TODO(xpan): No way to track CPU async op timing accurately?
    for (const auto& execs : op_execs_) {
      for (const auto& exec : execs.second) {
        total += exec.second;
      }
    }
    return total;
  }
  int64 exec_micros() const;

  const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs()
      const {

@@ -164,6 +79,7 @@ class ExecStep {
  int64 allocator_bytes_in_use() const { return allocator_bytes_in_use_; }

 private:
  TFGraphNode* node;
  // The earliest/latest time including scheduling and kernel execution.
  int64 all_start_micros_;
  int64 latest_end_rel_micros_;

@@ -480,6 +396,9 @@ class TFMultiGraphNode {
  std::map<string, const TFGraphNode*> nodes_;
  std::map<string, std::unique_ptr<TFMultiGraphNode>> children_;
};

bool IsCombinedGPUStream(const string& device);
bool IsCPUDevice(const string& device);
}  // namespace tfprof
}  // namespace tensorflow

@@ -68,7 +68,7 @@ class GraphNode : public ShowNode {
 public:
  explicit GraphNode(TFGraphNode* node) : ShowNode(node) {}

  bool Trackable(int64 step) { return node->trackable(step); }
  bool Trackable(int64 step) const { return node->trackable(step); }

  std::vector<GraphNode*> children;
  std::vector<GraphNode*> show_children;

@@ -114,7 +114,7 @@ string ChromeTraceFormatter::Format() {
  return trace_str;
}

void MemoryTracker::TrackNode(int64 step, GraphNode* node) {
void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
  if (!node->Trackable(step)) {
    return;
  }

@@ -139,8 +139,8 @@ void MemoryTracker::TrackNode(int64 step, GraphNode* node) {
  }
}

void MemoryTracker::TrackNodeConnection(int64 step, GraphNode* node,
                                        GraphNode* src) {
void MemoryTracker::TrackNodeConnection(int64 step, const GraphNode* node,
                                        const GraphNode* src) {
  if (!node->Trackable(step) || !src->Trackable(step)) {
    return;
  }

|
|||
}
|
||||
}
|
||||
|
||||
void Timeline::GenerateGraphTimeline(const GraphNode* gnode) {
|
||||
AddGraphNode(gnode);
|
||||
void Timeline::AllocateTimeNodes(GraphNode* gnode) {
|
||||
if (gnode->Trackable(step_)) {
|
||||
TrackNode(gnode);
|
||||
const TFGraphNode* node = gnode->node;
|
||||
for (const auto& kernel_execs : node->op_execs(step_)) {
|
||||
const string& device = kernel_execs.first;
|
||||
if (!IsCombinedGPUStream(device) && !IsCPUDevice(device)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (process_.find(device) == process_.end()) {
|
||||
int64 pid = AllocatePID();
|
||||
process_[device].reset(new Process(device, pid));
|
||||
chrome_formatter_.EmitPID(GetTimeDevName(device), pid);
|
||||
}
|
||||
Process* p = process_[device].get();
|
||||
|
||||
for (const auto& exec : kernel_execs.second) {
|
||||
int64 start_micros = exec.first;
|
||||
int64 exec_micros = exec.second;
|
||||
// TODO(xpan): There might be start time duplication here.
|
||||
if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
|
||||
// TODO(xpan): Give each kernel call a unique_name.
|
||||
tnodes_[device][start_micros].reset(
|
||||
new TimeNode(p, gnode, start_micros, exec_micros));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (GraphNode* n : gnode->show_children) {
|
||||
AllocateTimeNodes(n);
|
||||
}
|
||||
}
|
||||
|
||||
void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
|
||||
for (GraphNode* gnode : gnodes) {
|
||||
AllocateTimeNodes(gnode);
|
||||
}
|
||||
for (auto& process : tnodes_) {
|
||||
for (auto& tn : process.second) {
|
||||
TimeNode* tnode = tn.second.get();
|
||||
for (GraphNode* inp : tnode->node->children) {
|
||||
if (!inp->account || !inp->Trackable(step_)) {
|
||||
continue;
|
||||
}
|
||||
TrackNodeConnection(tnode->node, inp);
|
||||
for (const auto& kernel_execs : inp->node->op_execs(step_)) {
|
||||
if (process.first == kernel_execs.first) {
|
||||
// Not interested in flow withthin the same device.
|
||||
continue;
|
||||
}
|
||||
for (const auto& exec : kernel_execs.second) {
|
||||
int64 start_micros = exec.first;
|
||||
auto cprocess = tnodes_.find(kernel_execs.first);
|
||||
if (cprocess == tnodes_.end()) continue;
|
||||
auto ctn = cprocess->second.find(start_micros);
|
||||
if (ctn == cprocess->second.end()) continue;
|
||||
ctn->second->next_tnodes.push_back(tnode);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
AllocateLanes();
|
||||
fprintf(stdout, "generating trace file.\n");
|
||||
// int64 flow_id = 1;
|
||||
int64 flow_id = 1;
|
||||
for (const auto& process : alloc_nodes_) {
|
||||
for (const auto& lane : process.second) {
|
||||
for (const auto& node : lane.second) {
|
||||
TimeNode* tnode = node.second;
|
||||
|
||||
Json::Value args(Json::objectValue);
|
||||
args["name"] = Json::Value(tnode->name);
|
||||
args["op"] = Json::Value(tnode->name);
|
||||
args["name"] = Json::Value(tnode->name());
|
||||
args["op"] = Json::Value(tnode->name());
|
||||
chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
|
||||
process.first, lane.first, "Op",
|
||||
tnode->name, args);
|
||||
tnode->name(), args);
|
||||
// Flow is a directed arrow pointing from src to dst.
|
||||
// TODO(xpan): Disable flow to reduce json file size for now. Need
|
||||
// to think of a better way to make flow interpretable.
|
||||
/*
|
||||
for (TimeNode* next_tnode : node.second->next_tnodes) {
|
||||
chrome_formatter_.EmitFlowStart(
|
||||
tnode->name + "_flow", tnode->start_micros + tnode->exec_micros,
|
||||
tnode->name() + "_flow", tnode->start_micros + tnode->exec_micros,
|
||||
process.first, lane.first, flow_id);
|
||||
chrome_formatter_.EmitFlowEnd(
|
||||
tnode->name + "_flow", next_tnode->start_micros,
|
||||
tnode->name() + "_flow", next_tnode->start_micros,
|
||||
next_tnode->process->pid, next_tnode->tid, flow_id);
|
||||
flow_id += 1;
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@@ -268,51 +328,6 @@ void Timeline::OutputTimeline() {
  fflush(stdout);
}

std::vector<TimeNode*> Timeline::AddGraphNode(const GraphNode* gnode) {
  std::vector<TimeNode*> tnodes;
  if (!gnode) return tnodes;

  std::vector<TimeNode*> shown_cinputs;
  for (GraphNode* schild : gnode->show_children) {
    std::vector<TimeNode*> inputs = AddGraphNode(schild);
    shown_cinputs.insert(shown_cinputs.end(), inputs.begin(), inputs.end());
  }
  if (!gnode->node->trackable(step_)) {
    return shown_cinputs;
  }

  const TFGraphNode* node = gnode->node;
  for (const auto& kernel_execs : node->op_execs(step_)) {
    const string& device = kernel_execs.first;
    const std::vector<std::pair<int64, int64>>& execs = kernel_execs.second;

    if (process_.find(device) == process_.end()) {
      int64 pid = AllocatePID();
      process_[device].reset(new Process(pid));
      chrome_formatter_.EmitPID(GetTimeDevName(device), pid);
    }
    Process* p = process_[device].get();

    for (const auto& exec : execs) {
      int64 start_micros = exec.first;
      int64 exec_micros = exec.second;
      // TODO(xpan): There might be start time duplication here.
      if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
        // TODO(xpan): Give each kernel call a unique_name.
        tnodes_[device][start_micros].reset(
            new TimeNode(p, node->name(), start_micros, exec_micros));
      }
      TimeNode* tnode_ptr = tnodes_[device][start_micros].get();

      for (int i = 0; i < shown_cinputs.size(); i++) {
        shown_cinputs[i]->next_tnodes.push_back(tnode_ptr);
      }
      tnodes.push_back(tnode_ptr);
    }
  }
  return tnodes;
}

void Timeline::AllocateLanes() {
  for (auto& process : tnodes_) {
    Process* p = process_[process.first].get();

@@ -58,26 +58,29 @@ class ChromeTraceFormatter {

class Process {
 public:
  Process(int64 pid) : pid(pid) {}
  Process(const string& device, int64 pid) : device(device), pid(pid) {}

  // Each lane is a map from start_time to end_time.
  std::vector<std::map<int64, int64>> lanes;
  string device;
  int64 pid;
};

class TimeNode {
 public:
  TimeNode(Process* process, const string& name, int64 start_micros,
  TimeNode(Process* process, GraphNode* node, int64 start_micros,
           int64 exec_micros)
      : process(process),
        name(name),
        node(node),
        start_micros(start_micros),
        exec_micros(exec_micros),
        tid(-1) {}
  virtual ~TimeNode() {}

  const string& name() { return node->name(); }

  Process* process;
  string name;
  GraphNode* node;
  int64 start_micros;
  int64 exec_micros;
  int64 tid;

@@ -101,9 +104,10 @@ class MemoryTracker {
    std::map<int64, int64> allocator_stats;
  };

  void TrackNode(int64 step, GraphNode* node);
  void TrackNode(int64 step, const GraphNode* node);

  void TrackNodeConnection(int64 step, GraphNode* node, GraphNode* src);
  void TrackNodeConnection(int64 step, const GraphNode* node,
                           const GraphNode* src);

  const std::map<string, Device>& devices() const { return devices_; }

@@ -120,13 +124,13 @@ class Timeline {
  int64 step() const { return step_; }
  void SetStep(int64 step) { step_ = step; }

  void GenerateGraphTimeline(const GraphNode* gnode);
  void GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes);

  void GenerateScopeTimeline(const ScopeNode* node);

  void GenerateCodeTimeline(const CodeNode* node);

  void TrackNode(GraphNode* node) { mem_tracker_.TrackNode(step_, node); }
  void TrackNode(const GraphNode* node) { mem_tracker_.TrackNode(step_, node); }

  void TrackNodeConnection(GraphNode* node, GraphNode* src) {
    mem_tracker_.TrackNodeConnection(step_, node, src);

@@ -165,7 +169,7 @@ class Timeline {
        << " children:" << total_micros;
  }

  std::vector<TimeNode*> AddGraphNode(const GraphNode* gnode);
  void AllocateTimeNodes(GraphNode* gnode);

  void AllocateLanes();
|||
Loading…
Reference in New Issue
Block a user