Update operator documentation with markdown descriptions and interfaces (#8085)

* Update operator documentation with markdown descriptions and interfaces

* Added rest of updated operator documentation to source files

* Committing local changes for rebase

* Fixed bracket typo in sqrt_op.cc file

* Added updated markdown documentation to remaining completed ops
Matthew Inkawhich 2018-06-15 19:02:24 -04:00 committed by Edward Z. Yang
parent d968614502
commit b10c94b507
38 changed files with 3661 additions and 453 deletions

View File

@@ -39,12 +39,50 @@ OPERATOR_SCHEMA(Abs)
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
Calculates the absolute value of the given input tensor, element-wise. Calculates the absolute value of the given input tensor, element-wise.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/abs_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Abs",
["X"],
["Y"]
)
workspace.FeedBlob("X", np.random.randn(5).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X: [ 0.3005476 1.551666 -1.3591481 0.39191285 -0.21866608]
Y: [0.3005476 1.551666 1.3591481 0.39191285 0.21866608]
```
</details>
)DOC") )DOC")
.Input(0, "input", "Input tensor") .Input(0, "X", "*(type: Tensor<float\>)* Input tensor.")
.Output( .Output(
0, 0,
"output", "Y",
"The absolute value of the input tensor computed element-wise") "*(type: Tensor`<float>`)* Absolute value of input element-wise.")
.InheritOnnxSchema("Abs"); .InheritOnnxSchema("Abs");
OPERATOR_SCHEMA(AbsGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); OPERATOR_SCHEMA(AbsGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape();

View File

@@ -97,34 +97,144 @@ OPERATOR_SCHEMA(ArgMax)
.NumOutputs(1) .NumOutputs(1)
.TensorInferenceFunction(InferTensor) .TensorInferenceFunction(InferTensor)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Retrive the argmax of the axis dimension. Given an input tensor of shape Retrieve the argmax of an axis dimension specified by the `axis`
[a_0, a_1, ..., a_{n-1}] and two arguments axis as int and keepdims as bool, argument. Given an input tensor and two arguments (`axis` and
returns one output: `keepdims`), returns a tensor containing the indices of the largest
- Index tensor which contains the indices of the largest element. It has the element along the given axis. If the `keepdims` arg is *True* (default),
same dims as X.dims() with the dimension along axis equals 1 when the shape of the output tensor matches the input tensor except the
keepdims == true otherwise removed. `axis` dimension equals 1. Else, the `axis` dimension of the output
tensor is removed.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/arg_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"ArgMax",
["X"],
["Indices"],
axis=2,
keepdims=False
)
workspace.FeedBlob("X", (np.random.randint(10, size=(3,3,3))).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Indices:", workspace.FetchBlob("Indices"))
```
**Result**
```
X: [[[4. 9. 6.]
[6. 6. 1.]
[9. 5. 4.]]
[[6. 7. 4.]
[7. 9. 1.]
[3. 2. 8.]]
[[3. 4. 6.]
[5. 2. 7.]
[1. 5. 7.]]]
Indices: [[1 0 0]
[1 1 2]
[2 2 2]]
```
</details>
)DOC") )DOC")
.Input(0, "X", "Tenor of shape [a_0, a_1, ..., a_{n-1}].") .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output(0, "Indices", "Tensor of indices for the largest values.") .Output(0,
.Arg("axis", "The axis to get argmax.") "Indices",
.Arg("keepdims", "Whether to keep the axis dim in the output."); "*(type: Tensor`<float>`)* Tensor of indices for the largest values.")
.Arg("axis", "*(type: int; default: -1)* The axis to get argmax.")
.Arg("keepdims",
"*(type: bool; default: True)* If True (default), the output tensor "
"shape will match the input tensor shape except the `axis` dimension "
"equals 1. Else, the `axis` dimension of the output tensor is removed.");
OPERATOR_SCHEMA(ArgMin) OPERATOR_SCHEMA(ArgMin)
.NumInputs(1) .NumInputs(1)
.NumOutputs(1) .NumOutputs(1)
.TensorInferenceFunction(InferTensor) .TensorInferenceFunction(InferTensor)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Retrive the argmin of the axis dimension. Given an input tensor of shape Retrieve the argmin of an axis dimension specified by the `axis`
[a_0, a_1, ..., a_{n-1}] and two arguments axis as int and keepdims as bool, argument. Given an input tensor and two arguments (`axis` and
returns one output: `keepdims`), returns a tensor containing the indices of the smallest
- Index tensor which contains the indices of the largest element. It has the element along the given axis. If the `keepdims` arg is *True* (default),
same dims as X.dims() with the dimension along axis equals 1 when the shape of the output tensor matches the input tensor except the
keepdims == true otherwise removed. `axis` dimension equals 1. Else, the `axis` dimension of the output
tensor is removed.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/arg_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"ArgMin",
["X"],
["Indices"],
axis=1
)
workspace.FeedBlob("X", (np.random.randint(10, size=(5,5))).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Indices:", workspace.FetchBlob("Indices"))
```
**Result**
```
X: [[9. 4. 6. 4. 1.]
[5. 9. 8. 3. 4.]
[6. 1. 0. 2. 9.]
[7. 8. 2. 4. 9.]
[3. 9. 4. 9. 4.]]
Indices: [[4]
[3]
[2]
[2]
[0]]
```
</details>
)DOC") )DOC")
.Input(0, "X", "Tenor of shape [a_0, a_1, ..., a_{n-1}].") .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output(0, "Indices", "Tensor of indices for the largest values.") .Output(0,
.Arg("axis", "The axis to get argmin.") "Indices",
.Arg("keepdims", "Whether to keep the axis dim in the output."); "*(type: Tensor`<float>`)* Tensor of indices for the smallest values.")
.Arg("axis", "*(type: int; default: -1)* The axis to get argmin.")
.Arg("keepdims",
"*(type: bool; default: True)* If True (default), the output tensor "
"shape will match the input tensor shape except the `axis` dimension "
"equals 1. Else, the `axis` dimension of the output tensor is removed.");
NO_GRADIENT(ArgMax); NO_GRADIENT(ArgMax);
NO_GRADIENT(ArgMin); NO_GRADIENT(ArgMin);
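As a quick sanity check on the ArgMax example above, the same indices can be reproduced with plain NumPy (a sketch, not part of this diff; it reuses the `X` values printed in the example):

```
import numpy as np

X = np.array([[[4., 9., 6.], [6., 6., 1.], [9., 5., 4.]],
              [[6., 7., 4.], [7., 9., 1.], [3., 2., 8.]],
              [[3., 4., 6.], [5., 2., 7.], [1., 5., 7.]]])

# With keepdims=False the reduced axis is dropped, matching the operator output.
print(np.argmax(X, axis=2))
# [[1 0 0]
#  [1 1 2]
#  [2 2 2]]
```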

View File

@@ -97,25 +97,87 @@ OPERATOR_SCHEMA(Cast)
return out; return out;
}) })
.SetDoc(R"DOC( .SetDoc(R"DOC(
The operator casts the elements of a given input tensor to a data type Casts the elements of a given input tensor to a data type specified by the `to`
specified by the 'to' argument and returns an output tensor of the same size in argument and returns an output tensor of the same size in the converted type.
the converted type. The 'to' argument must be one of the data types specified The `to` argument must be one of the data types specified in the *DataType*
in the 'DataType' enum field in the TensorProto message. If the 'to' argument enum field in the TensorProto message (see below). If the `to` argument is not
is not provided or is not one of the enumerated types in DataType, Caffe2 provided or is not one of the enumerated types in *DataType*, Caffe2 throws an
throws an Enforce error. Enforce error.
NOTE: Casting to and from strings is not supported yet. NOTE: Casting to and from strings is not supported yet.
TensorProto *DataType* field:
```
message TensorProto {
...
enum DataType {
UNDEFINED = 0;
FLOAT = 1; // float
INT32 = 2; // int
BYTE = 3; // BYTE, when deserialized, is going to be restored as uint8.
STRING = 4; // string
BOOL = 5; // bool
UINT8 = 6; // uint8_t
INT8 = 7; // int8_t
UINT16 = 8; // uint16_t
INT16 = 9; // int16_t
INT64 = 10; // int64_t
FLOAT16 = 12; // caffe2::__f16, caffe2::float16
DOUBLE = 13; // double
}
```
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/cast_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Cast",
["X"],
["Y"],
to=2
)
workspace.FeedBlob("X", (np.random.rand(3,3)).astype(np.float32)*10)
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X: [[9.436466 5.8529844 0.54932857]
[1.1583444 2.9936118 0.22950427]
[3.9143739 3.4040766 8.905341 ]]
Y: [[9 5 0]
[1 2 0]
[3 3 8]]
```
</details>
)DOC") )DOC")
.Arg( .Arg(
"to", "to",
"The data type to which the elements of the input tensor are cast." "*(type: int)* Data type to which the elements of the input tensor are "
"Strictly must be one of the types from DataType enum in TensorProto") "cast. Strictly must be one of the types from *DataType* enum in "
.Input(0, "input", "Input tensor to be cast.") "TensorProto.")
.Input(0, "X", "*(type: Tensor)* Input tensor to be cast.")
.Output( .Output(
0, 0,
"output", "Y",
"Output tensor with the same shape as input with type " "*(type: Tensor`<'to' type>`)* Output tensor with the same shape as "
"specified by the 'to' argument") "input with type specified by the `to` argument.")
.InheritOnnxSchema("Cast"); .InheritOnnxSchema("Cast");
// Some Casts are compatible with gradients, but for now we don't support it // Some Casts are compatible with gradients, but for now we don't support it
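The example above passes the raw enum value `to=2`. If the Caffe2 protobuf bindings are available, the symbolic *DataType* name can be used instead, which is easier to read (a sketch, not from this commit; it assumes `caffe2.proto.caffe2_pb2` is importable, as in a standard Caffe2 install):

```
from caffe2.python import core, workspace
from caffe2.proto import caffe2_pb2
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator(
    "Cast",
    ["X"],
    ["Y"],
    to=caffe2_pb2.TensorProto.INT32   # same as to=2
)
workspace.FeedBlob("X", (np.random.rand(3, 3)).astype(np.float32) * 10)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y").dtype)  # int32
```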

View File

@@ -11,12 +11,59 @@ OPERATOR_SCHEMA(Ceil)
.NumOutputs(1) .NumOutputs(1)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.SetDoc(R"DOC( .SetDoc(R"DOC(
Ceil takes one input data (Tensor<T>) and produces one output data Element-wise application of the ceil function ($y=ceil(x)$) to the input tensor
(Tensor<T>) where the ceil function, y = ceil(x), is applied to `X`. Output tensor shape is the same as the input tensor.
the tensor elementwise. Currently supports only float32.
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/ceil_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Ceil",
["X"],
["X"],
)
workspace.FeedBlob("X", (np.random.uniform(-10, 10, (5,5))).astype(np.float32))
print("X before running op:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("X after running op:", workspace.FetchBlob("X"))
```
**Result**
```
X before running op:
[[ 8.44598 -6.5098248 -2.2993476 -7.6859694 0.58566964]
[-7.846551 -0.03689406 6.9362907 -4.0521703 4.4969673 ]
[ 0.33355865 -7.895527 -8.393201 9.374202 -2.3930092 ]
[-6.3061996 3.1403487 3.782099 -8.516556 -2.8387244 ]
[-2.0164998 4.7663913 -3.422966 0.3636999 8.75713 ]]
X after running op:
[[ 9. -6. -2. -7. 1.]
[-7. -0. 7. -4. 5.]
[ 1. -7. -8. 10. -2.]
[-6. 4. 4. -8. -2.]
[-2. 5. -3. 1. 9.]]
```
</details>
)DOC") )DOC")
.Input(0, "X", "ND input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output(0, "Y", "ND input tensor"); .Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.");
// TODO: Write gradient for this when needed // TODO: Write gradient for this when needed
GRADIENT_NOT_IMPLEMENTED_YET(Ceil); GRADIENT_NOT_IMPLEMENTED_YET(Ceil);

View File

@@ -40,24 +40,72 @@ OPERATOR_SCHEMA(Clip)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
Clip operator limits the given input within an interval. The interval is This operator limits the given input within an interval. The interval is
specified with arguments 'min' and 'max'. They default to specified by the `min` and `max` arguments. They default to
numeric_limits::lowest() and numeric_limits::max() respectively. The clipping *numeric_limits::lowest()* and *numeric_limits::max()* respectively. The
operation can be done in in-place fashion too, where the input and output blobs clipping operation can be done in an in-place fashion by using the same output
are the same. blob as the input blob.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/clip_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Clip",
["X"],
["Y"],
min=20.0,
max=60.0
)
workspace.FeedBlob("X", (np.random.randint(100, size=(5,5))).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X: [[45. 16. 59. 99. 48.]
[12. 44. 46. 82. 28.]
[ 1. 91. 18. 9. 71.]
[24. 37. 61. 12. 81.]
[36. 38. 30. 84. 40.]]
Y: [[45. 20. 59. 60. 48.]
[20. 44. 46. 60. 28.]
[20. 60. 20. 20. 60.]
[24. 37. 60. 20. 60.]
[36. 38. 30. 60. 40.]]
```
</details>
)DOC") )DOC")
.Arg("min", "Minimum value, under which element is replaced by min") .Arg("min", "*(type: float)* Minimum value, under which element is "
.Arg("max", "Maximum value, above which element is replaced by max") "replaced by min (default=*numeric_limits::lowest()*).")
.Arg("max", "*(type: float)* Maximum value, under which element is "
"replaced by max (default=*numeric_limits::max()*).")
.Input( .Input(
0, 0,
"input", "X",
"Input tensor (Tensor<float>) containing elements to be" "*(Tensor`<float>`)* Input tensor within range "
"clipped") "[*numeric_limits::lowest()*, *numeric_limits::max()*].")
.Input( .Output(
1, 0,
"output", "Y",
"Output tensor (Tensor<float>) containing clipped" "*(Tensor`<float>`)* Output tensor clipped within range [`min`, `max`].")
"input elements")
.InheritOnnxSchema("Clip"); .InheritOnnxSchema("Clip");
OPERATOR_SCHEMA(ClipGradient).NumInputs(2).NumOutputs(1).AllowInplace({{1, 0}}); OPERATOR_SCHEMA(ClipGradient).NumInputs(2).NumOutputs(1).AllowInplace({{1, 0}});
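The doc string notes that Clip can run in place by reusing the input blob as the output blob, but the example writes to a separate `Y`. A minimal in-place variant, following the same pattern as the Ceil example (a sketch, not part of this diff):

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator(
    "Clip",
    ["X"],
    ["X"],      # output blob == input blob -> in-place clipping
    min=20.0,
    max=60.0
)
workspace.FeedBlob("X", np.random.randint(100, size=(5, 5)).astype(np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("X"))  # every element now lies in [20, 60]
```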

View File

@@ -156,14 +156,13 @@ REGISTER_CPU_OPERATOR(Concat, ConcatOp<CPUContext>);
OPERATOR_SCHEMA(Concat) OPERATOR_SCHEMA(Concat)
.NumInputs(1, INT_MAX) .NumInputs(1, INT_MAX)
.NumOutputs(2) .NumOutputs(2)
.Arg("axis", "Which axis to concat on") .Arg("axis", "*(type: int; default: -1)* Axis to concatenate on.")
.Arg( .Arg(
"order", "order",
"Either NHWC or NCHW, will concat on C axis, defaults to NCHW") "*(type: string; default='NCHW')* Order of blob dimensions. Concats on the C dimension.")
.Arg( .Arg(
"add_axis", "add_axis",
"Pass 1 to add the axis specified in arg 'axis' to all " "*(type: int)* Pass non-zero integer to add the axis specified in `axis` to all input tensors.")
"input tensors")
.TensorInferenceFunction(OpSchema::NeedsAllInputShapes( .TensorInferenceFunction(OpSchema::NeedsAllInputShapes(
[](const OperatorDef& def, [](const OperatorDef& def,
const vector<TensorShape>& in) { const vector<TensorShape>& in) {
@@ -238,9 +237,128 @@ OPERATOR_SCHEMA(Concat)
})) }))
.CostInferenceFunction(CostInferenceForConcat) .CostInferenceFunction(CostInferenceForConcat)
.DeviceInferenceFunction(concatOpDevInfer) .DeviceInferenceFunction(concatOpDevInfer)
.SetDoc("Concatenate a list of tensors into a single tensor") .SetDoc(R"DOC(
.Output(0, "concat_result", "Concatenated tensor") Concatenate a list of tensors into a single tensor. Similar functionality to
.Output(1, "split_info", "The dimensions of the inputs.") Numpy's [concatenate](https://docs.scipy.org/doc/numpy/reference/generated/numpy.concatenate.html)
function. The `axis` argument specifies what axis along which the arrays will be concatenated.
When set to non-zero (default=0), the `add_axis` argument adds the axis specified in `axis` to
all input tensors.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/concat_split_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/concat_split_op.h
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Concat",
["X1", "X2"],
["Y", "split_info"],
axis=0
)
workspace.FeedBlob("X1", np.array([[1,2],[3,4]]))
workspace.FeedBlob("X2", np.array([[5,6]]))
print("X1:", workspace.FetchBlob("X1"))
print("X2:", workspace.FetchBlob("X2"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
print("split_info:", workspace.FetchBlob("split_info"))
```
**Result**
```
X1: [[1 2]
[3 4]]
X2: [[5 6]]
Y: [[1 2]
[3 4]
[5 6]]
split_info: [2 1]
```
</details>
<details>
<summary> <b>Example 2</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Concat",
["X1", "X2"],
["Y", "split_info"],
add_axis=1,
axis=3
)
workspace.FeedBlob("X1", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW
workspace.FeedBlob("X2", np.random.randint(10, size=(1, 1, 5, 5))) # NCHW
print("X1:", workspace.FetchBlob("X1"))
print("X2:", workspace.FetchBlob("X2"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
print("split_info:", workspace.FetchBlob("split_info"))
```
**Result**
```
X1: [[[[1 8 3 9 0]
[6 4 6 5 6]
[3 9 1 9 9]
[5 1 0 7 7]
[9 4 0 0 9]]]]
X2: [[[[7 0 2 6 1]
[3 9 4 0 3]
[5 3 8 9 4]
[3 4 2 1 0]
[0 8 8 8 1]]]]
Y: [[[[[1 8 3 9 0]
[7 0 2 6 1]]
[[6 4 6 5 6]
[3 9 4 0 3]]
[[3 9 1 9 9]
[5 3 8 9 4]]
[[5 1 0 7 7]
[3 4 2 1 0]]
[[9 4 0 0 9]
[0 8 8 8 1]]]]]
split_info: [1 1]
```
</details>
)DOC")
.Input(0, "X1, X2, ...", "*(type: Tensor`<float>`)* List of input tensors.")
.Output(0, "concat_result", "*(type: Tensor`<float>`)* Concatenated tensor.")
.Output(1, "split_info", "*(type: Tensor`<int>`)* The dimensions of the inputs.")
.InheritOnnxSchema("Concat"); .InheritOnnxSchema("Concat");
// Backward compatibility names. // Backward compatibility names.
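A rough NumPy analogy for the two Concat examples (a sketch, not from this commit): with the default arguments Concat behaves like `np.concatenate`, and with `add_axis=1` it produces the same shape as `np.stack` along the given axis, at least for the shapes used in Example 2 (an assumption based on the documented output shape):

```
import numpy as np

# Example 1: plain concatenation along axis 0.
X1 = np.array([[1, 2], [3, 4]])
X2 = np.array([[5, 6]])
Y = np.concatenate([X1, X2], axis=0)      # [[1 2] [3 4] [5 6]]
split_info = [X1.shape[0], X2.shape[0]]   # [2, 1]

# Example 2: add_axis=1 with axis=3 inserts a new axis, like np.stack.
A = np.random.randint(10, size=(1, 1, 5, 5))
B = np.random.randint(10, size=(1, 1, 5, 5))
stacked = np.stack([A, B], axis=3)
print(stacked.shape)                      # (1, 1, 5, 2, 5), same as the example's Y
```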

View File

@@ -38,12 +38,53 @@ OPERATOR_SCHEMA(Cos)
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
Calculates the cosine of the given input tensor, element-wise. Calculates the cosine of the given input tensor, element-wise.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/cos_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Cos",
["X"],
["Y"]
)
workspace.FeedBlob("X", np.random.rand(5).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X: [0.6816719 0.76771533 0.933932 0.01404487 0.11862425]
Y: [0.7765203 0.71949923 0.5946774 0.99990135 0.9929724 ]
```
</details>
)DOC") )DOC")
.Input(0, "input", "Input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output( .Output(
0, 0,
"output", "Y",
"The cosine of the input tensor computed element-wise"); "*(type: Tensor`<float>`)* Output tensor calculated as the cosine of the input tensor, element-wise.");
OPERATOR_SCHEMA(CosGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); OPERATOR_SCHEMA(CosGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape();

View File

@@ -1,8 +1,129 @@
#include "counter_ops.h" #include "counter_ops.h"
#include "caffe2/core/blob_serialization.h" #include "caffe2/core/blob_serialization.h"
namespace caffe2 { namespace caffe2 {
const char* githubLinks = R"DOC(
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/counter_ops.cc
)DOC";
const char* kCountExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
createcounter_op = core.CreateOperator(
"CreateCounter",
[],
["counter"],
init_count=5
)
retrievecount_op = core.CreateOperator(
"RetrieveCount",
["counter"],
["count"]
)
checkcounterdone_op = core.CreateOperator(
"CheckCounterDone",
["counter"],
["done"]
)
countup_op = core.CreateOperator(
"CountUp",
["counter"],
["previous_count"],
)
countdown_op = core.CreateOperator(
"CountDown",
["counter"],
["done"],
)
resetcounter_op = core.CreateOperator(
"ResetCounter",
["counter"],
["previous_count"],
init_count=3
)
# Create counter
workspace.RunOperatorOnce(createcounter_op)
print("'counter' pointer:", workspace.FetchBlob("counter"))
# Retrieve initial counter value
workspace.RunOperatorOnce(retrievecount_op)
print("Initial 'count':", workspace.FetchBlob("count"))
# Check if counter is done
workspace.RunOperatorOnce(checkcounterdone_op)
print("Initial 'done' value:", workspace.FetchBlob("done"))
# Test CountUp operator
print("\nTesting CountUp operator...")
for i in range(5):
workspace.RunOperatorOnce(countup_op)
print("'previous_count' after CountUp:", workspace.FetchBlob("previous_count"))
workspace.RunOperatorOnce(retrievecount_op)
print("'count' value after CountUp test:", workspace.FetchBlob("count"))
# Test CountDown operator
print("\nTesting CountDown operator...")
for i in range(11):
workspace.RunOperatorOnce(countdown_op)
workspace.RunOperatorOnce(retrievecount_op)
print("'count' value after CountDown: {}\t'done' value: {}".format(workspace.FetchBlob("count"), workspace.FetchBlob("done")))
```
**Result**
```
'counter' pointer: counter, a C++ native class of type std::__1::unique_ptr<caffe2::Counter<long long>, std::__1::default_delete<caffe2::Counter<long long> > >.
Initial 'count': 5
Initial 'done' value: False
Testing CountUp operator...
'previous_count' after CountUp: 5
'previous_count' after CountUp: 6
'previous_count' after CountUp: 7
'previous_count' after CountUp: 8
'previous_count' after CountUp: 9
'count' value after CountUp test: 10
Testing CountDown operator...
'count' value after CountDown: 9 'done' value: False
'count' value after CountDown: 8 'done' value: False
'count' value after CountDown: 7 'done' value: False
'count' value after CountDown: 6 'done' value: False
'count' value after CountDown: 5 'done' value: False
'count' value after CountDown: 4 'done' value: False
'count' value after CountDown: 3 'done' value: False
'count' value after CountDown: 2 'done' value: False
'count' value after CountDown: 1 'done' value: False
'count' value after CountDown: 0 'done' value: False
'count' value after CountDown: -1 'done' value: True
```
</details>
)DOC";
namespace { namespace {
/** /**
* @brief CounterSerializer is the serializer for Counter type. * @brief CounterSerializer is the serializer for Counter type.
@@ -74,60 +195,98 @@ OPERATOR_SCHEMA(CreateCounter)
.NumInputs(0) .NumInputs(0)
.NumOutputs(1) .NumOutputs(1)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Creates a count-down counter with initial value specified by the 'init_count' Creates a count-down counter with initial value specified by the `init_count`
argument. argument.
)DOC")
.Output(0, "counter", "A blob pointing to an instance of a new counter.") )DOC" + (string) githubLinks + (string) kCountExample)
.Arg("init_count", "Initial count for the counter, must be >= 0."); .Output(
0,
"counter",
"*(type: Tensor`<ptr>`)* A blob pointing to an instance of a new counter.")
.Arg(
"init_count",
"*(type: int; default: 0)* Initial count for the counter, must be >= 0.");
OPERATOR_SCHEMA(ResetCounter) OPERATOR_SCHEMA(ResetCounter)
.NumInputs(1) .NumInputs(1)
.NumOutputs(0, 1) .NumOutputs(0, 1)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Resets a count-down counter with initial value specified by the 'init_count' Resets a count-down counter with initial value specified by the `init_count`
argument. argument.
)DOC") )DOC" + (string) githubLinks + (string) kCountExample)
.Input(0, "counter", "A blob pointing to an instance of a new counter.") .Input(
.Output(0, "previous_value", "(optional) Previous value of the counter.") 0,
.Arg("init_count", "Resets counter to this value, must be >= 0."); "counter",
"*(type: Tensor`<ptr>`)* A blob pointing to an instance of a counter.")
.Output(
0,
"previous_value",
"*(type: int)* [OPTIONAL] count value BEFORE this operation.")
.Arg(
"init_count",
"*(type: int; default: 0)* Resets counter to this value, must be >= 0.");
OPERATOR_SCHEMA(CountDown) OPERATOR_SCHEMA(CountDown)
.NumInputs(1) .NumInputs(1)
.NumOutputs(1) .NumOutputs(1)
.SetDoc(R"DOC( .SetDoc(R"DOC(
If the internal count value > 0, decreases count value by 1 and outputs false, If the internal count value > 0, decreases count value by 1 and outputs False,
otherwise outputs true. otherwise outputs True.
)DOC") )DOC" + (string) githubLinks + (string) kCountExample)
.Input(0, "counter", "A blob pointing to an instance of a counter.") .Input(
.Output(0, "done", "false unless the internal count is zero."); 0,
"counter",
"*(type: Tensor`<ptr>`)* A blob pointing to an instance of a counter.")
.Output(
0,
"done",
"*(type: bool)* False unless the internal count is zero.");
OPERATOR_SCHEMA(CheckCounterDone) OPERATOR_SCHEMA(CheckCounterDone)
.NumInputs(1) .NumInputs(1)
.NumOutputs(1) .NumOutputs(1)
.SetDoc(R"DOC( .SetDoc(R"DOC(
If the internal count value <= 0, outputs true, otherwise outputs false, If the internal count value <= 0, outputs true, otherwise outputs false.
)DOC") )DOC" + (string) githubLinks + (string) kCountExample)
.Input(0, "counter", "A blob pointing to an instance of a counter.") .Input(
.Output(0, "done", "true if the internal count is zero or negative."); 0,
"counter",
"*(type: Tensor`<ptr>`)* A blob pointing to an instance of a counter.")
.Output(
0,
"done",
"*(type: bool)* True if the internal count is zero or negative, otherwise False.");
OPERATOR_SCHEMA(CountUp) OPERATOR_SCHEMA(CountUp)
.NumInputs(1) .NumInputs(1)
.NumOutputs(1) .NumOutputs(1)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Increases count value by 1 and outputs the previous value atomically Increases count value by 1 and outputs the previous value atomically.
)DOC") )DOC" + (string) githubLinks + (string) kCountExample)
.Input(0, "counter", "A blob pointing to an instance of a counter.") .Input(
.Output(0, "previous_count", "count value BEFORE this operation"); 0,
"counter",
"*(type: Tensor`<ptr>`)* A blob pointing to an instance of a counter.")
.Output(
0,
"previous_count",
"*(type: int)* Count value BEFORE this operation.");
OPERATOR_SCHEMA(RetrieveCount) OPERATOR_SCHEMA(RetrieveCount)
.NumInputs(1) .NumInputs(1)
.NumOutputs(1) .NumOutputs(1)
.ScalarType(TensorProto::INT64) .ScalarType(TensorProto::INT64)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Retrieve the current value from the counter. Retrieve the current value from the counter as an integer.
)DOC") )DOC" + (string) githubLinks + (string) kCountExample)
.Input(0, "counter", "A blob pointing to an instance of a counter.") .Input(
.Output(0, "count", "current count value."); 0,
"counter",
"*(type: Tensor`<ptr>`)* A blob pointing to an instance of a counter.")
.Output(
0,
"count",
"*(type: int)* Current count value.");
SHOULD_NOT_DO_GRADIENT(CreateCounter); SHOULD_NOT_DO_GRADIENT(CreateCounter);
SHOULD_NOT_DO_GRADIENT(ResetCounter); SHOULD_NOT_DO_GRADIENT(ResetCounter);

View File

@@ -474,12 +474,95 @@ OPERATOR_SCHEMA(DotProduct)
.NumOutputs(1) .NumOutputs(1)
.IdenticalTypeAndShapeOfInputDim(0, 0) .IdenticalTypeAndShapeOfInputDim(0, 0)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Given two input float tensors X, Y, and produces one output float tensor Computes and outputs the dot product of the two input float tensors `X` and `Y`.
of the dot product between X and Y. Note that `X` and `Y` must be either 1D or 2D, and they must be the same shape.
The output tensor is 1D, which represents either the product of each element in
a respective dimension if the inputs are 1D, or the sum of the products in a
given dimension if the inputs are 2D matrices. Note that the actual dot product
is a scalar value, which is effectively the sum of the elements in the 1D
output tensor.
For 1D inputs:
Given two vectors $X = [x_0, x_1, x_2]$ and $Y = [y_0, y_1, y_2]$; $Z = [x_0 * y_0, x_1 * y_1, x_2 * y_2]$
For 2D inputs:
Given two matrices:
$$X = [[x_0^0, x_1^0, x_2^0], \\ [x_0^1, x_1^1, x_2^1], \\ [x_0^2, x_1^2, x_2^2], \\ ..., \\ [x_0^n, x_1^n, x_2^n]]$$
and
$$Y = [[y_0^0, y_1^0, y_2^0], \\ [y_0^1, y_1^1, y_2^1], \\ [y_0^2, y_1^2, y_2^2], \\ ..., \\ [y_0^n, y_1^n, y_2^n]]$$
then
$$Z = \biggl[\Big((x_0^0 * y_0^0) + (x_1^0 * y_1^0) + (x_2^0 * y_2^0)\Big), \\ \Big((x_0^1 * y_0^1) + (x_1^1 * y_1^1) + (x_2^1 * y_2^1)\Big), \\ \Big((x_0^2 * y_0^2) + (x_1^2 * y_1^2) + (x_2^2 * y_2^2)\Big), \\ ..., \\ \Big((x_0^n * y_0^n) + (x_1^n * y_1^n) + (x_2^n * y_2^n)\Big)\biggr]$$
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/distance_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"DotProduct",
["X", "Y"],
["Z"]
)
workspace.FeedBlob("X", np.random.randint(20, size=(5)).astype(np.float32))
workspace.FeedBlob("Y", np.random.randint(20, size=(5)).astype(np.float32))
print("X:\n", workspace.FetchBlob("X"))
print("Y:\n", workspace.FetchBlob("Y"))
workspace.RunOperatorOnce(op)
print("Z:\n", workspace.FetchBlob("X"))
workspace.ResetWorkspace()
workspace.FeedBlob("X", np.random.randint(10, size=(3,3)).astype(np.float32))
workspace.FeedBlob("Y", np.random.randint(10, size=(3,3)).astype(np.float32))
print("X:\n", workspace.FetchBlob("X"))
print("Y:\n", workspace.FetchBlob("Y"))
workspace.RunOperatorOnce(op)
print("Z:\n", workspace.FetchBlob("Z"))
```
**Result**
```
X:
[ 2. 15. 2. 7. 12.]
Y:
[ 3. 12. 9. 3. 18.]
Z:
[ 6. 180. 18. 21. 216.]
X:
[[2. 0. 4.]
[7. 7. 4.]
[7. 9. 9.]]
Y:
[[2. 0. 8.]
[9. 6. 1.]
[7. 8. 0.]]
Z:
[ 36. 109. 121.]
```
</details>
)DOC") )DOC")
.Input(0, "X", "1D or 2D input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* 1D or 2D input tensor.")
.Input(1, "Y", "1D or 2D input tensor (must have the same shape as X)") .Input(1, "Y", "*(type: Tensor`<float>`)* 1D or 2D input tensor (must have the same shape as X).")
.Output(0, "Z", "1D output tensor") .Output(0, "Z", "*(type: Tensor`<float>`)* 1D output tensor.")
.CostInferenceFunction( .CostInferenceFunction(
OpSchema::CostInferenceFunctionType(CostInferenceForDotProduct)); OpSchema::CostInferenceFunctionType(CostInferenceForDotProduct));
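To make the 1D vs. 2D behavior described above concrete, here is the same arithmetic in plain NumPy (a sketch, not part of this diff; it reuses the values from the example):

```
import numpy as np

# 1D case: the output is the element-wise product; its sum is the scalar dot product.
X = np.array([2., 15., 2., 7., 12.])
Y = np.array([3., 12., 9., 3., 18.])
Z_1d = X * Y                      # [  6. 180.  18.  21. 216.]
print(Z_1d.sum())                 # 441.0, the scalar dot product

# 2D case: one dot product per row.
X2 = np.array([[2., 0., 4.], [7., 7., 4.], [7., 9., 9.]])
Y2 = np.array([[2., 0., 8.], [9., 6., 1.], [7., 8., 0.]])
Z_2d = (X2 * Y2).sum(axis=1)      # [ 36. 109. 121.], matching the example's Z
```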

View File

@@ -77,22 +77,85 @@ OPERATOR_SCHEMA(Dropout)
return out; return out;
}) })
.SetDoc(R"DOC( .SetDoc(R"DOC(
Dropout takes one input data (Tensor<float>) and produces two Tensor outputs,
output (Tensor<float>) and mask (Tensor<bool>). Depending on whether it is in `Dropout` takes one input data tensor (`X`) and produces two tensor outputs, `Y` and
test mode or not, the output Y will either be a random dropout, or a simple `mask`. If the `is_test` argument is zero (default=0), the output `Y` will be the input
copy of the input. Note that our implementation of Dropout does scaling in with random elements zeroed. The probability that a given element is zeroed is
the training phase, so during testing nothing needs to be done. determined by the `ratio` argument.
If the `is_test` argument is set to non-zero, the output `Y` is exactly the same as the
input `X`. Note that outputs are scaled by a factor of $\frac{1}{1-ratio}$ during
training, so that during test time, we can simply compute an identity function. This
scaling is important because we want the output at test time to equal the expected value
at training time. Dropout has been proven to be an effective regularization technique to
prevent overfitting during training.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/dropout_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/dropout_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Dropout",
["X"],
["Y"] + ["mask"],
ratio=0.5,
is_test=0
)
workspace.FeedBlob("X", np.random.randint(10, size=(5, 5)).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
print("mask:", workspace.FetchBlob("mask"))
```
**Result**
```
X: [[5. 4. 3. 6. 9.]
[2. 1. 8. 0. 9.]
[7. 3. 0. 6. 3.]
[1. 8. 2. 6. 4.]
[6. 2. 6. 4. 0.]]
Y: [[ 0. 0. 0. 12. 18.]
[ 0. 0. 16. 0. 0.]
[ 0. 0. 0. 12. 6.]
[ 0. 0. 4. 0. 0.]
[12. 0. 0. 0. 0.]]
mask: [[False False False True True]
[False False True True False]
[False False True True True]
[False False True False False]
[ True False False False False]]
```
</details>
)DOC") )DOC")
.Arg("ratio", "(float, default 0.5) the ratio of random dropout") .Arg("ratio", "*(type: float; default: 0.5)* Probability of an element to be zeroed.")
.ArgIsTest( .ArgIsTest(
"(int) if nonzero, run dropout in test mode where " "*(type: int; default: 0)* If zero (train mode), perform dropout. If non-zero"
"the output is simply Y = X.") "(test mode), Y = X.")
.Input(0, "data", "The input data as Tensor.") .Input(0, "X", "*(type: Tensor`<float>`)* Input data tensor.")
.Output(0, "output", "The output.") .Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.")
.Output( .Output(
1, 1,
"mask", "mask",
"The output mask. If is_test is nonzero, this output is not filled.") "*(type: Tensor`<bool>`)* The output mask containing boolean values for"
"each element, signifying which elements are dropped out. If `is_test` is"
"nonzero, this output is not filled.")
.InheritOnnxSchema("Dropout"); .InheritOnnxSchema("Dropout");
OPERATOR_SCHEMA(DropoutGrad) OPERATOR_SCHEMA(DropoutGrad)
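The $\frac{1}{1-ratio}$ scaling mentioned in the doc string can be checked by hand against the example output (a NumPy sketch, not part of this diff; it uses the first row of `X` and `mask` from the example):

```
import numpy as np

X = np.array([5., 4., 3., 6., 9.])
mask = np.array([False, False, False, True, True])
ratio = 0.5

# Inverted dropout: surviving elements are scaled by 1/(1 - ratio) during training,
# so the expected value at training time matches the identity pass used at test time.
Y = X * mask / (1.0 - ratio)
print(Y)  # [ 0.  0.  0. 12. 18.]  -- first row of Y in the example
```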

View File

@@ -16,38 +16,230 @@ equal shape is specified by the argument "axis", and if it is not set, suffix
matching is assumed. 1-dim expansion doesn't work yet. matching is assumed. 1-dim expansion doesn't work yet.
For example, the following tensor shapes are supported (with broadcast=1): For example, the following tensor shapes are supported (with broadcast=1):
```
shape(A) = (2, 3, 4, 5), shape(B) = (,), i.e. B is a scalar shape(A) = (2, 3, 4, 5), shape(B) = (,), i.e. B is a scalar
shape(A) = (2, 3, 4, 5), shape(B) = (5,) shape(A) = (2, 3, 4, 5), shape(B) = (5,)
shape(A) = (2, 3, 4, 5), shape(B) = (4, 5) shape(A) = (2, 3, 4, 5), shape(B) = (4, 5)
shape(A) = (2, 3, 4, 5), shape(B) = (3, 4), with axis=1 shape(A) = (2, 3, 4, 5), shape(B) = (3, 4), with axis=1
shape(A) = (2, 3, 4, 5), shape(B) = (2), with axis=0 shape(A) = (2, 3, 4, 5), shape(B) = (2), with axis=0
```
Argument `broadcast=1` needs to be passed to enable broadcasting. Argument `broadcast=1` needs to be passed to enable broadcasting.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/elementwise_op_schema.cc
)DOC"; )DOC";
std::function<void(OpSchema&)> MathDocGenerator(const char* name) { const char* kAddExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Add",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([[1,2],[3,4]]))
workspace.FeedBlob("B", np.array([[5,6],[7,8]]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A:
[[1 2]
[3 4]]
B:
[[5 6]
[7 8]]
C:
[[ 6 8]
[10 12]]
```
</details>
)DOC";
const char* kSubExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Sub",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([[10,12],[4,14]]))
workspace.FeedBlob("B", np.array([[5,16],[1,19]]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A:
[[10 12]
[ 4 14]]
B:
[[ 5 16]
[ 1 19]]
C:
[[ 5 -4]
[ 3 -5]]
```
</details>
)DOC";
const char* kMulExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Mul",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([[1,2],[3,4]]))
workspace.FeedBlob("B", np.array([[5,6],[7,8]]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A:
[[1 2]
[3 4]]
B:
[[5 6]
[7 8]]
C:
[[ 5 12]
[21 32]]
```
</details>
)DOC";
const char* kDivExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Div",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([[18,8],[2,9]]))
workspace.FeedBlob("B", np.array([[9,2],[3,2]]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A:
[[18 8]
[ 2 9]]
B:
[[9 2]
[3 2]]
C:
[[2 4]
[0 4]]
```
</details>
)DOC";
std::function<void(OpSchema&)> MathDocGenerator(const char* name, const char* extra) {
return [=](OpSchema& schema) { return [=](OpSchema& schema) {
string doc = R"DOC( string doc = R"DOC(
Performs element-wise binary {name} (with limited broadcast support). Performs element-wise binary {name} (with limited broadcast support).
{broadcast_doc})DOC"; {broadcast_doc}
{extra}
)DOC";
ReplaceAll(doc, "{name}", name); ReplaceAll(doc, "{name}", name);
ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc); ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc);
ReplaceAll(doc, "{extra}", extra);
schema.SetDoc(doc); schema.SetDoc(doc);
schema.Arg("broadcast", "Pass 1 to enable broadcasting"); schema.Arg("broadcast", "*(type: int; default: 0)* Pass 1 to enable broadcasting");
schema.Arg( schema.Arg(
"axis", "axis",
"If set, defines the broadcast dimensions. See doc for details."); "*(type: int; default: -1)* Axis to concatenate on.");
schema.Input( schema.Input(
0, 0,
"A", "A",
"First operand, should share the type with the second operand."); "*(type: Tensor`<float>`)* First operand, should share the type with the second operand.");
schema.Input( schema.Input(
1, 1,
"B", "B",
"Second operand. With broadcasting can be of smaller size than A. " "*(type: Tensor`<float>`)* Second operand. With broadcasting can be of smaller size than A. "
"If broadcasting is disabled it should be of the same size."); "If broadcasting is disabled it should be of the same size as A.");
schema.Output(0, "C", "Result, has same dimensions and type as A"); schema.Output(0, "C", "*(type: Tensor`<float>`)* Output tensor with same dimensions and type as A.");
}; };
} }
@@ -81,7 +273,7 @@ OPERATOR_SCHEMA(Add)
.AllowInplace({{0, 0}, {1, 0}}) .AllowInplace({{0, 0}, {1, 0}})
.CostInferenceFunction(PointwiseCostInference<1>) .CostInferenceFunction(PointwiseCostInference<1>)
.TensorInferenceFunction(ElementwiseOpShapeInference) .TensorInferenceFunction(ElementwiseOpShapeInference)
.FillUsing(MathDocGenerator("addition")) .FillUsing(MathDocGenerator("addition", kAddExample))
.InheritOnnxSchema("Add"); .InheritOnnxSchema("Add");
OPERATOR_SCHEMA(AddGradient) OPERATOR_SCHEMA(AddGradient)
.NumInputs(3) .NumInputs(3)
@@ -94,7 +286,7 @@ OPERATOR_SCHEMA(Sub)
.AllowInplace({{0, 0}, {1, 0}}) .AllowInplace({{0, 0}, {1, 0}})
.CostInferenceFunction(PointwiseCostInference<1>) .CostInferenceFunction(PointwiseCostInference<1>)
.TensorInferenceFunction(ElementwiseOpShapeInference) .TensorInferenceFunction(ElementwiseOpShapeInference)
.FillUsing(MathDocGenerator("subtraction")) .FillUsing(MathDocGenerator("subtraction", kSubExample))
.InheritOnnxSchema("Sub"); .InheritOnnxSchema("Sub");
OPERATOR_SCHEMA(SubGradient) OPERATOR_SCHEMA(SubGradient)
.NumInputs(3) .NumInputs(3)
@@ -107,7 +299,7 @@ OPERATOR_SCHEMA(Mul)
.AllowInplace({{0, 0}, {1, 0}}) .AllowInplace({{0, 0}, {1, 0}})
.CostInferenceFunction(PointwiseCostInference<1>) .CostInferenceFunction(PointwiseCostInference<1>)
.TensorInferenceFunction(ElementwiseOpShapeInference) .TensorInferenceFunction(ElementwiseOpShapeInference)
.FillUsing(MathDocGenerator("multiplication")) .FillUsing(MathDocGenerator("multiplication", kMulExample))
.InheritOnnxSchema("Mul"); .InheritOnnxSchema("Mul");
OPERATOR_SCHEMA(MulGradient) OPERATOR_SCHEMA(MulGradient)
.NumInputs(3) .NumInputs(3)
@@ -120,7 +312,7 @@ OPERATOR_SCHEMA(Div)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.CostInferenceFunction(PointwiseCostInference<1>) .CostInferenceFunction(PointwiseCostInference<1>)
.TensorInferenceFunction(ElementwiseOpShapeInference) .TensorInferenceFunction(ElementwiseOpShapeInference)
.FillUsing(MathDocGenerator("division")) .FillUsing(MathDocGenerator("division", kDivExample))
.InheritOnnxSchema("Div"); .InheritOnnxSchema("Div");
OPERATOR_SCHEMA(DivGradient).NumInputs(4).NumOutputs(2).AllowInplace({{0, 0}}); OPERATOR_SCHEMA(DivGradient).NumInputs(4).NumOutputs(2).AllowInplace({{0, 0}});
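The Add/Sub/Mul/Div examples above all use operands of identical shape. The broadcasting modes listed in the shared doc string can be exercised like this (a sketch, not from this commit; it uses the `shape(A) = (2, 3, 4, 5)`, `shape(B) = (5,)` case with suffix matching):

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator(
    "Add",
    ["A", "B"],
    ["C"],
    broadcast=1   # required whenever the operand shapes differ
)
workspace.FeedBlob("A", np.random.rand(2, 3, 4, 5).astype(np.float32))
workspace.FeedBlob("B", np.random.rand(5).astype(np.float32))  # suffix-matched shape
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("C").shape)  # (2, 3, 4, 5)
```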
@@ -162,35 +354,270 @@ For example, the following tensor shapes are supported:
"If broadcasting is disabled it should be of the same size.") "If broadcasting is disabled it should be of the same size.")
.Output(0, "C", "Result, has same dimensions and type as B"); .Output(0, "C", "Result, has same dimensions and type as B");
const char* kLTExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"LT",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([1, 5, 2, 9, 12, 3]))
workspace.FeedBlob("B", np.array([1, 3, 4, 9, 12, 8]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A: [ 1 5 2 9 12 3]
B: [ 1 3 4 9 12 8]
C: [False False True False False True]
```
</details>
)DOC";
const char* kLEExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"LE",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([1, 5, 2, 9, 12, 3]))
workspace.FeedBlob("B", np.array([1, 3, 4, 9, 12, 8]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A: [ 1 5 2 9 12 3]
B: [ 1 3 4 9 12 8]
C: [ True False True True True True]
```
</details>
)DOC";
const char* kGTExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"GT",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([1, 5, 2, 9, 12, 3]))
workspace.FeedBlob("B", np.array([1, 3, 4, 9, 12, 8]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A: [ 1 5 2 9 12 3]
B: [ 1 3 4 9 12 8]
C: [False True False False False False]
```
</details>
)DOC";
const char* kGEExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"GE",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([1, 5, 2, 9, 12, 3]))
workspace.FeedBlob("B", np.array([1, 3, 4, 9, 12, 8]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A: [ 1 5 2 9 12 3]
B: [ 1 3 4 9 12 8]
C: [ True True False True True False]
```
</details>
)DOC";
const char* kEQExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"EQ",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([1, 5, 2, 9, 12, 3]))
workspace.FeedBlob("B", np.array([1, 3, 4, 9, 12, 8]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A: [ 1 5 2 9 12 3]
B: [ 1 3 4 9 12 8]
C: [ True False False True True False]
```
</details>
)DOC";
const char* kNEExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"NE",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([1, 5, 2, 9, 12, 3]))
workspace.FeedBlob("B", np.array([1, 3, 4, 9, 12, 8]))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A: [ 1 5 2 9 12 3]
B: [ 1 3 4 9 12 8]
C: [False True True False False True]
```
</details>
)DOC";
std::function<void(OpSchema&)> ComparisonDocGenerator( std::function<void(OpSchema&)> ComparisonDocGenerator(
const char* name, const char* name,
const char* desc) { const char* desc,
const char* extra) {
return [=](OpSchema& schema) { return [=](OpSchema& schema) {
string doc = R"DOC( string doc = R"DOC(
Performs element-wise {desc} comparison `{name}` (with limited broadcast support). Performs element-wise {desc} comparison **{name}** (with limited broadcast support).
{broadcast_doc})DOC";
{broadcast_doc}
{extra}
)DOC";
ReplaceAll(doc, "{name}", name); ReplaceAll(doc, "{name}", name);
ReplaceAll(doc, "{desc}", desc); ReplaceAll(doc, "{desc}", desc);
ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc); ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc);
ReplaceAll(doc, "{extra}", extra);
schema.SetDoc(doc); schema.SetDoc(doc);
schema.Arg("broadcast", "Pass 1 to enable broadcasting"); schema.Arg("broadcast", "*(type: int; default: 0)* Pass 1 to enable broadcasting.");
schema.Arg( schema.Arg(
"axis", "axis",
"If set, defines the broadcast dimensions. See doc for details."); "*(type: int; default: -1)* Axis to concatenate on. If set, defines the broadcast dimensions.");
schema.Input( schema.Input(
0, 0,
"A", "A",
"First operand, should share the type with the second operand."); "*(type: Tensor`<bool>`)* First operand, should share the type with the second operand.");
schema.Input( schema.Input(
1, 1,
"B", "B",
"Second operand. With broadcasting can be of smaller size than A. " "*(type: Tensor`<bool>`)* Second operand. With broadcasting can be of smaller size than `A`. "
"If broadcasting is disabled it should be of the same size."); "If broadcasting is disabled it should be of the same size.");
schema.Output(0, "C", "Result, has same dimensions and A and type `bool`"); schema.Output(0, "C", "*(type: Tensor`<bool>`)* Output tensor with same dimensions as `A`.");
}; };
} }
#define CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(name, symbol, desc) \ #define CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(name, symbol, desc, extra) \
OPERATOR_SCHEMA(name) \ OPERATOR_SCHEMA(name) \
.NumInputs(2) \ .NumInputs(2) \
.NumOutputs(1) \ .NumOutputs(1) \
@@ -210,51 +637,200 @@ Performs element-wise {desc} comparison `{name}` (with limited broadcast support
return vector<TensorShape>{ \ return vector<TensorShape>{ \
CreateTensorShape(output_dims, TensorProto::BOOL)}; \ CreateTensorShape(output_dims, TensorProto::BOOL)}; \
}) \ }) \
.FillUsing(ComparisonDocGenerator(symbol, desc)); \ .FillUsing(ComparisonDocGenerator(symbol, desc, extra)); \
SHOULD_NOT_DO_GRADIENT(name) SHOULD_NOT_DO_GRADIENT(name)
CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(EQ, "==", "equal to"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(EQ, "==", "equal to", kEQExample);
CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(NE, "!=", "not equal to"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(NE, "!=", "not equal to", kNEExample);
CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(LT, "<", "less than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(LT, "<", "less than", kLTExample);
CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(LE, "<=", "less or equal than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(LE, "<=", "less or equal than", kLEExample);
CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(GT, ">", "greater than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(GT, ">", "greater than", kGTExample);
CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(GE, ">=", "greater or equal than"); CAFFE2_SCHEMA_FOR_BINARY_COMPARISON_OP(GE, ">=", "greater or equal than", kGEExample);
std::function<void(OpSchema&)> LogicalDocGenerator(const char* name) { const char* kAndExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"And",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", (np.random.rand(3, 3) > 0.5))
workspace.FeedBlob("B", (np.random.rand(3, 3) > 0.5))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A:
[[ True False False]
[False True False]
[False False True]]
B:
[[ True False True]
[False False False]
[False False False]]
C:
[[ True False False]
[False False False]
[False False False]]
```
</details>
)DOC";
const char* kOrExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Or",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", (np.random.rand(3, 3) > 0.5))
workspace.FeedBlob("B", (np.random.rand(3, 3) > 0.5))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A:
[[False True True]
[False True True]
[ True True True]]
B:
[[False True False]
[ True True True]
[False True False]]
C:
[[False True True]
[ True True True]
[ True True True]]
```
</details>
)DOC";
const char* kXorExample = R"DOC(
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Xor",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", (np.random.rand(3, 3) > 0.5))
workspace.FeedBlob("B", (np.random.rand(3, 3) > 0.5))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("C"))
```
**Result**
```
A:
[[ True True True]
[False False True]
[False True False]]
B:
[[False False False]
[ True True True]
[False False False]]
C:
[[ True True True]
[ True True False]
[False True False]]
```
</details>
)DOC";
std::function<void(OpSchema&)> LogicalDocGenerator(const char* name, const char* extra) {
return [=](OpSchema& schema) { return [=](OpSchema& schema) {
string doc = R"DOC( string doc = R"DOC(
Performs element-wise logical operation `{name}` (with limited broadcast support). Performs element-wise logical operation **{name}** (with limited broadcast support).
Both input operands should be of type `bool`. Both input operands should be of type `bool`.
{broadcast_doc})DOC";
{broadcast_doc}
{extra}
)DOC";
ReplaceAll(doc, "{name}", name); ReplaceAll(doc, "{name}", name);
ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc); ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc);
ReplaceAll(doc, "{extra}", extra);
schema.SetDoc(doc); schema.SetDoc(doc);
schema.Arg("broadcast", "Pass 1 to enable broadcasting"); schema.Arg("broadcast", "*(type: int; default: 0)* Pass 1 to enable broadcasting.");
schema.Arg( schema.Arg(
"axis", "axis",
"If set, defines the broadcast dimensions. See doc for details."); "*(type: int; default: -1)* Axis to concatenate on. If set, defines the broadcast dimensions.");
schema.Input(0, "A", "First operand."); schema.Input(0, "A", "*(type: Tensor`<bool>`)* First operand.");
schema.Input( schema.Input(
1, 1,
"B", "B",
"Second operand. With broadcasting can be of smaller size than A. " "*(type: Tensor`<bool>`)* Second operand. With broadcasting can be of smaller size than `A`. "
"If broadcasting is disabled it should be of the same size."); "If broadcasting is disabled it should be of the same size.");
schema.Output(0, "C", "Result, has same dimensions and A and type `bool`"); schema.Output(0, "C", "*(type: Tensor`<bool>`)* Output tensor of booleans. Has same dimensions as input `A`.");
}; };
} }
#define CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(name, symbol, onnx_schema) \ #define CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(name, symbol, onnx_schema, extra) \
OPERATOR_SCHEMA(name) \ OPERATOR_SCHEMA(name) \
.NumInputs(2) \ .NumInputs(2) \
.NumOutputs(1) \ .NumOutputs(1) \
.AllowInplace({{0, 0}}) \ .AllowInplace({{0, 0}}) \
.FillUsing(LogicalDocGenerator(symbol)) \ .FillUsing(LogicalDocGenerator(symbol, extra)) \
.InheritOnnxSchema(onnx_schema); \ .InheritOnnxSchema(onnx_schema); \
SHOULD_NOT_DO_GRADIENT(name) SHOULD_NOT_DO_GRADIENT(name)
CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Or, "or", "Or"); CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Or, "or", "Or", kOrExample);
CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(And, "and", "And"); CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(And, "and", "And", kAndExample);
CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Xor, "xor", "Xor"); CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP(Xor, "xor", "Xor", kXorExample);
#undef CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP #undef CAFFE2_SCHEMA_FOR_BINARY_LOGICAL_OP
@@ -267,17 +843,17 @@ Both input operands should be of type `bool`.
ReplaceAll(doc, "{name}", name); ReplaceAll(doc, "{name}", name);
ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc); ReplaceAll(doc, "{broadcast_doc}", kBroadcastDoc);
schema.SetDoc(doc); schema.SetDoc(doc);
schema.Arg("broadcast", "Pass 1 to enable broadcasting"); schema.Arg("broadcast", "*(type: int; default: 0)* Pass 1 to enable broadcasting.");
schema.Arg( schema.Arg(
"axis", "axis",
"If set, defines the broadcast dimensions. See doc for details."); "*(type: int; default: -1)* Axis to concatenate on. If set, defines the broadcast dimensions.");
schema.Input(0, "A", "First operand."); schema.Input(0, "A", "*(type: Tensor)* First operand.");
schema.Input( schema.Input(
1, 1,
"B", "B",
"Second operand. With broadcasting can be of smaller size than A. " "*(type: Tensor)* Second operand. With broadcasting can be of smaller size than `A`. "
"If broadcasting is disabled it should be of the same size."); "If broadcasting is disabled it should be of the same size.");
schema.Output(0, "C", "Result, has same dimensions and type with A."); schema.Output(0, "C", "*(type: Tensor)* Output tensor. Has same dimensions as input `A`.");
}; };
} }
@@ -286,7 +862,7 @@ Both input operands should be of type `bool`.
.NumInputs(2) \ .NumInputs(2) \
.NumOutputs(1) \ .NumOutputs(1) \
.AllowInplace({{0, 0}}) \ .AllowInplace({{0, 0}}) \
.FillUsing(LogicalDocGenerator(symbol)); \ .FillUsing(BitwiseDocGenerator(symbol)); \
SHOULD_NOT_DO_GRADIENT(name) SHOULD_NOT_DO_GRADIENT(name)
CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(BitwiseOr, "bitwise_or"); CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(BitwiseOr, "bitwise_or");
@@ -298,18 +874,111 @@ CAFFE2_SCHEMA_FOR_BINARY_BITWISE_OP(BitwiseXor, "bitwise_xor");
OPERATOR_SCHEMA(Not) OPERATOR_SCHEMA(Not)
.NumInputs(1) .NumInputs(1)
.NumOutputs(1) .NumOutputs(1)
.SetDoc(R"DOC(Performs element-wise negation.)DOC") .SetDoc(R"DOC(
.Input(0, "X", "Input tensor of type `bool`.") Performs element-wise negation on input tensor `X`.
.Output(0, "Y", "Output tensor of type `bool`.")
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/elementwise_op_schema.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Not",
["X"],
["Y"],
)
workspace.FeedBlob("X", (np.random.rand(3, 3) > 0.5))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[ True False False]
[False False False]
[ True True True]]
Y:
[[False True True]
[ True True True]
[False False False]]
```
</details>
)DOC")
.Input(0, "X", "*(Tensor`<bool>`)* Input tensor.")
.Output(0, "Y", "*(Tensor`<bool>`)* Negated output tensor.")
.InheritOnnxSchema("Not"); .InheritOnnxSchema("Not");
SHOULD_NOT_DO_GRADIENT(Not); SHOULD_NOT_DO_GRADIENT(Not);
OPERATOR_SCHEMA(Sign) OPERATOR_SCHEMA(Sign)
.NumInputs(1) .NumInputs(1)
.NumOutputs(1) .NumOutputs(1)
.SetDoc(R"DOC(Performs element-wise sign.)DOC") .SetDoc(R"DOC(
.Input(0, "X", "Input tensor.") Computes sign for each element of the input: -1, 0 or 1.
.Output(0, "Y", "Output tensor.");
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/elementwise_op_schema.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Sign",
["X"],
["Y"],
)
workspace.FeedBlob("X", (np.random.rand(3, 3).astype(np.float32) - np.random.rand(3, 3).astype(np.float32)))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[ 0.02816287 0.22408086 -0.30342305]
[-0.18481976 0.03948995 0.39698976]
[-0.63304734 -0.6919183 -0.31524038]]
Y:
[[ 1. 1. -1.]
[-1. 1. 1.]
[-1. -1. -1.]]
```
</details>
)DOC")
.Input(0, "X", "*(type: Tensor`<float>`)* Input data tensor.")
.Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.");
SHOULD_NOT_DO_GRADIENT(Sign); SHOULD_NOT_DO_GRADIENT(Sign);
} // namespace caffe2 } // namespace caffe2

View File

@@ -23,12 +23,99 @@ OPERATOR_SCHEMA(Sum)
.InputsCanCrossDevices() .InputsCanCrossDevices()
.IdenticalTypeAndShapeOfInput(0) .IdenticalTypeAndShapeOfInput(0)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Element-wise sum of each of the input tensors. The first input tensor can be Element-wise sum of each of the input tensors. The first input tensor can be used
used in-place as the output tensor, in which case the sum will be done in in-place as the output tensor, in which case the sum will be done in place and
place and results will be accumulated in input0. All inputs and outputs must results will be accumulated in the first input tensor. All inputs and outputs must
have the same shape and data type. have the same shape and data type.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/elementwise_sum_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Sum",
["A", "B"],
["C"],
)
workspace.FeedBlob("A", np.array([[1,2],[3,4]]).astype(np.float32))
workspace.FeedBlob("B", np.array([[5,6],[7,8]]).astype(np.float32))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("C:", workspace.FetchBlob("A"))
```
**Result**
```
A: [[1. 2.]
[3. 4.]]
B: [[5. 6.]
[7. 8.]]
C: [[ 6.  8.]
[10. 12.]]
```
</details>
<details>
<summary> <b>Example 2</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Sum",
["A", "B"],
["A"], # inplace
)
workspace.FeedBlob("A", np.array([[1,2,5],[8,3,4]]).astype(np.float32))
workspace.FeedBlob("B", np.array([[9,5,6],[6,7,8]]).astype(np.float32))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("A after Sum:", workspace.FetchBlob("A"))
```
**Result**
```
A: [[1. 2. 5.]
[8. 3. 4.]]
B: [[9. 5. 6.]
[6. 7. 8.]]
A after Sum: [[10. 7. 11.]
[14. 10. 12.]]
```
</details>
)DOC") )DOC")
.Input(0, "data_0", "First of the input tensors. Can be inplace.") .Input(0, "A", "*(type: Tensor`<float>`)* First tensor to be added element-wise.")
.Output(0, "sum", "Output tensor. Same dimension as inputs.") .Input(1, "B", "*(type: Tensor`<float>`)* Second tensor to be added element-wise.")
.Output(0, "C", "*(type: Tensor`<float>`)* Sum of A and B.")
.InheritOnnxSchema("Sum"); .InheritOnnxSchema("Sum");
} }

View File

@ -15,16 +15,60 @@ OPERATOR_SCHEMA(Exp)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
Calculates the exponential of the given input tensor, element-wise. This Calculates the exponential of the given input tensor ($exp(x)$), element-wise. This
operation can be done in an in-place fashion too, by providing the same input operation can be done in an in-place fashion too, by providing the same input
and output blobs. and output blobs.
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/exp_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Exp",
["X"],
["X"],
)
workspace.FeedBlob("X", (np.random.rand(3,3)).astype(np.float32))
print("X before running op:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("X after running op:", workspace.FetchBlob("X"))
```
**Result**
```
X before running op:
[[0.5821691 0.07719802 0.50159824]
[0.40952456 0.36788362 0.84887683]
[0.02472685 0.65730894 0.9066397 ]]
X after running op:
[[1.7899168 1.080256 1.6513585]
[1.5061016 1.4446739 2.3370204]
[1.0250351 1.9295927 2.4759884]]
```
</details>
)DOC") )DOC")
.Input(0, "input", "Input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output( .Output(
0, 0,
"output", "Y",
"The exponential of the input tensor computed " "*(type: Tensor`<float>`)* The exponential of the input tensor computed "
"element-wise") "element-wise.")
.InheritOnnxSchema("Exp"); .InheritOnnxSchema("Exp");
namespace { namespace {

View File

@ -46,48 +46,165 @@ OPERATOR_SCHEMA(ConstantFill)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.TensorInferenceFunction(FillerTensorInference<>) .TensorInferenceFunction(FillerTensorInference<>)
.SetDoc(R"DOC( .SetDoc(R"DOC(
The operator fills the elements of the output tensor with a constant value This operator fills the elements of the output tensor with a constant value
specified by the 'value' argument. specified by the `value` argument.
The data type is specified by the 'dtype' argument. The 'dtype' argument must - The data type is specified by the `dtype` argument
be one of the data types specified in the 'DataType' enum field in the
TensorProto message. If the 'dtype' argument is not provided, the data type of
'value' is used.
The output tensor shape is specified by the 'shape' argument. If the number of - Currently, the data types supported are *float*, *int32*, *int64*, and *bool*
input is 1, the shape will be identical to that of the input at run time with
optional additional dimensions appended at the end as specified by 'extra_shape'
argument. In that case the 'shape' argument should not be set.
If input_as_shape is set to true, then the input should be a 1D tensor - If the `dtype` argument is not provided, the data type of `value` is used
containing the desired output shape (the dimensions specified in extra_shape
- The output tensor shape is either specified by the `shape` argument or will
match the shape of the input tensor if one is provided (if an input tensor is
provided, a shape argument should not be set)
- Optional additional dimensions can be appended at the end as specified by
`extra_shape` argument
- If `input_as_shape` is set to True, the input should be a 1D tensor
containing the desired output shape (the dimensions specified in `extra_shape`
will also be appended) will also be appended)
NOTE: Currently, it supports data type of float, int32, int64, and bool. When specifying `dtype` argument, use the integer keys from the *DataType* enum
in TensorProto:
```
message TensorProto {
...
enum DataType {
UNDEFINED = 0;
FLOAT = 1; // float
INT32 = 2; // int
BYTE = 3; // BYTE, when deserialized, is going to be restored as uint8.
STRING = 4; // string
BOOL = 5; // bool
UINT8 = 6; // uint8_t
INT8 = 7; // int8_t
UINT16 = 8; // uint16_t
INT16 = 9; // int16_t
INT64 = 10; // int64_t
FLOAT16 = 12; // caffe2::__f16, caffe2::float16
DOUBLE = 13; // double
}
```
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/filler_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"ConstantFill",
[],
["Y"],
shape=(1,5,5)
)
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
Y: [[[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0.]]]
```
</details>
<details>
<summary> <b>Example 2</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"ConstantFill",
["X"],
["Y"],
value=4.0,
dtype=1,
extra_shape=(1,2)
)
workspace.FeedBlob("X", (np.random.randint(100, size=(3,3))).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X: [[86. 30. 84.]
[34. 51. 9.]
[29. 86. 59.]]
Y: [[[[4. 4.]]
[[4. 4.]]
[[4. 4.]]]
[[[4. 4.]]
[[4. 4.]]
[[4. 4.]]]
[[[4. 4.]]
[[4. 4.]]
[[4. 4.]]]]
```
</details>
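<details>
<summary> <b>Example 3 (input_as_shape)</b> </summary>

A minimal supplementary sketch (not taken from an existing test) of the `input_as_shape` path described above: the single input is assumed to be a 1D int64 tensor on the CPU that holds the desired output shape.

**Code**

```
workspace.ResetWorkspace()

op = core.CreateOperator(
    "ConstantFill",
    ["shape"],
    ["Y"],
    input_as_shape=1,  # interpret the input blob as the output shape
    value=2.0,
    dtype=1            # FLOAT, per the DataType enum above
)

# 1D tensor holding the desired output shape
workspace.FeedBlob("shape", np.array([2, 3]).astype(np.int64))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))  # expected: a 2x3 float tensor filled with 2.0
```

</details>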
)DOC") )DOC")
.Arg("value", "The value for the elements of the output tensor. Default is 0.0f.") .Arg(
"value",
"*(type: primitive; default: 0.0f) value to populate output tensor with.")
.Arg( .Arg(
"dtype", "dtype",
"The data type for the elements of the output tensor." "*(type: int)* The data type for the elements of the output tensor. "
"Strictly must be one of the types from DataType enum in TensorProto.") "Strictly must be one of the types from *DataType* enum in TensorProto.")
.Arg( .Arg(
"shape", "shape",
"The shape of the output tensor." "*(type: int | Tuple(int))* Shape of the output tensor. Cannot pass an "
"Cannot set the shape argument and pass in an input at the same time.") "input blob and this arg at the same time.")
.Arg( .Arg(
"extra_shape", "extra_shape",
"The additional dimensions appended at the end of the shape indicated" "*(type: int | Tuple(int))* Additional dimensions appended at the end "
"by the input blob." "of the shape indicated by the input blob. Cannot set this"
"Cannot set the extra_shape argument when there is no input blob.") "argument when there is no input blob.")
.Arg( .Arg(
"input_as_shape", "input_as_shape",
"1D tensor containing the desired output shape. First input must be in CPU context.") "*(type: int | Tuple(int))* 1D tensor containing the desired output "
.Input(0, "input", "Input tensor (optional) to provide shape information.") "shape. First input must be in CPU context.")
.Input(
0,
"X",
"*(type: Tensor)* [OPTIONAL] Input tensor to provide shape information.")
.Output( .Output(
0, 0,
"output", "Y",
"Output tensor of constant values specified by 'value'" "*(type: Tensor)* Output tensor of constant values.");
"argument and its type is specified by the 'dtype' argument");
OPERATOR_SCHEMA(DiagonalFill) OPERATOR_SCHEMA(DiagonalFill)
.NumInputs(0, 1) .NumInputs(0, 1)

View File

@ -30,21 +30,68 @@ OPERATOR_SCHEMA(Flatten)
}) })
.SetDoc(R"DOC( .SetDoc(R"DOC(
Flattens the input tensor into a 2D matrix. If input tensor has shape Flattens the input tensor into a 2D matrix. If input tensor has shape
(d_0, d_1, ... d_n) then the output will have shape $(d_0, d_1, ..., d_n)$ then the output will have shape
(d_0 X d_1 ... d_(axis-1), d_axis X d_(axis+1) ... X dn) $\bigl((d_0 * d_1 * ... * d_{(axis-1)}), (d_{axis} * d_{(axis+1)} * ... * d_n)\bigr)$.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/flatten_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Flatten",
["X"],
["Y"],
axis=1
)
workspace.FeedBlob("X", np.random.rand(1,3,2,2))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X: [[[[0.53432311 0.23734561]
[0.56481598 0.52152617]]
[[0.33662627 0.32472711]
[0.17939016 0.97175851]]
[[0.87226421 0.49045439]
[0.92470531 0.30935077]]]]
Y: [[0.53432311 0.23734561 0.56481598 0.52152617 0.33662627 0.32472711
0.17939016 0.97175851 0.87226421 0.49045439 0.92470531 0.30935077]]
```
</details>
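<details>
<summary> <b>Example 2 (axis=2)</b> </summary>

A small supplementary sketch (not from the original docs) illustrating the output-shape formula above with `axis=2`: for an input of shape (1,3,2,2) the expected output shape is $(1*3, 2*2) = (3,4)$.

**Code**

```
workspace.ResetWorkspace()

op = core.CreateOperator(
    "Flatten",
    ["X"],
    ["Y"],
    axis=2
)

workspace.FeedBlob("X", np.zeros((1, 3, 2, 2)).astype(np.float32))
workspace.RunOperatorOnce(op)
print("Y shape:", workspace.FetchBlob("Y").shape)  # expected: (3, 4)
```

</details>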
)DOC") )DOC")
.Input(0, "input", "A tensor of rank >= axis.") .Input(
0,
"X",
"*(type: Tensor)* Input Tensor of rank >= axis.")
.Output( .Output(
0, 0,
"output", "Y",
"A 2D tensor with the contents of the input tensor, " "*(type: Tensor)* A 2D tensor with the contents of the input tensor, "
"with input dimensions up to axis flattened to the outer dimension " "with input dimensions up to `axis` flattened to the outer dimension "
"of the output and remaining input dimensions flattened into the inner " "of the output and the remaining input dimensions flattened into the "
"dimension of the output.") "inner dimension of the output.")
.Arg( .Arg(
"axis", "axis",
"(Default to 1) Indicate up to which input dimensions " "*(type: int; default: 1)* Indicates up to which input dimensions "
"(exclusive) should be flattened to the outer dimension of the output") "(exclusive) should be flattened to the outer dimension of the output.")
.InheritOnnxSchema("Flatten"); .InheritOnnxSchema("Flatten");
class GetFlattenGradient : public GradientMakerBase { class GetFlattenGradient : public GradientMakerBase {

View File

@ -11,12 +11,61 @@ OPERATOR_SCHEMA(Floor)
.NumOutputs(1) .NumOutputs(1)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.SetDoc(R"DOC( .SetDoc(R"DOC(
Floor takes one input data (Tensor<T>) and produces one output data Element-wise application of the floor function ($y=floor(x)$) to the input
(Tensor<T>) where the floor function, y = floor(x), is applied to tensor `X`. Output tensor shape is the same as the input tensor. This
the tensor elementwise. Currently supports only float32. operator can be used in an in-place fashion by using the same input blob as the
output blob.
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/floor_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Floor",
["X"],
["X"],
)
workspace.FeedBlob("X", (np.random.uniform(-10, 10, (5,5))).astype(np.float32))
print("X before running op:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("X after running op:", workspace.FetchBlob("X"))
```
**Result**
```
X before running op:
[[ 3.813361 -1.319647 5.2089314 -4.931328 0.6218652 ]
[ 7.2757645 5.5552588 5.785643 -2.4790506 -0.41400087]
[ 1.1541046 -6.933266 3.3754056 1.6569928 -1.7670316 ]
[-3.4932013 4.891472 1.5530115 -3.2443287 -4.605099 ]
[-4.574543 -7.360948 5.91305 -8.196495 -5.357458 ]]
X after running op:
[[ 3. -2. 5. -5. 0.]
[ 7. 5. 5. -3. -1.]
[ 1. -7. 3. 1. -2.]
[-4. 4. 1. -4. -5.]
[-5. -8. 5. -9. -6.]]
```
</details>
)DOC") )DOC")
.Input(0, "X", "ND input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output(0, "Y", "ND input tensor"); .Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.");
// TODO: Write gradient for this when needed // TODO: Write gradient for this when needed
GRADIENT_NOT_IMPLEMENTED_YET(Floor); GRADIENT_NOT_IMPLEMENTED_YET(Floor);

View File

@ -20,94 +20,209 @@ OPERATOR_SCHEMA(DBExists)
.NumInputs(0) .NumInputs(0)
.NumOutputs(1) .NumOutputs(1)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Checks if the DB exists. Checks if the db described by the arguments exists.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/load_save_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"DBExists",
[],
["exists"],
db_name="test_db",
db_type="leveldb",
)
workspace.RunOperatorOnce(op)
print("exists:", workspace.FetchBlob("exists"))
```
</details>
)DOC") )DOC")
.Output(0, "exists", "A scalar bool Tensor.") .Output(0, "exists", "*(type: Tensor`<bool>`)* Scalar boolean output "
"tensor. True if the db exists, else false.")
.Arg( .Arg(
"absolute_path", "absolute_path",
"(int, default 0) if set, use the db path directly and do not prepend " "*(type: int; default: 0)* If set to non-zero, save the db directly to "
"the current root folder of the workspace.") "the path specified by the `db` arg. If not set (default), prepend the "
.Arg("db_name", "(string) the path to the db to load.") "path of the current root folder of the workspace to the path specified "
.Arg("db_type", "(string) the type of the db."); "by the `db` arg.")
.Arg("db_name", "*(type: string)* Path to the db in question; see the "
"`absolute_path` arg details for options regarding the current root folder "
"of the workspace.")
.Arg("db_type", "*(type: string)* Type of db to save (options: \"lmdb\", "
"\"leveldb\", \"minidb\").");
OPERATOR_SCHEMA(Load) OPERATOR_SCHEMA(Load)
.NumInputs(0, INT_MAX) .NumInputs(0, INT_MAX)
.NumOutputs(0, INT_MAX) .NumOutputs(0, INT_MAX)
.SetDoc(R"DOC( .SetDoc(R"DOC(
The Load operator loads a set of serialized blobs from a db or multiple dbs. It The Load operator loads a set of serialized blobs from a db or multiple dbs. It
takes [0, infinity) number of inputs and [0, infinity) number of outputs, using takes $[0, \infty)$ number of inputs and $[0, \infty)$ number of outputs, using
the db keys to match the db entries with the outputs. the db keys to match the db entries with the outputs.
If at least one input is passed, then it is assumed that the input blobs are a If at least one input is passed, then it is assumed that the input blobs are a
set of DBReaders to load from. Otherwise the db or dbs argument is used to load set of DBReaders to load from. Otherwise the `db` or `dbs` argument is used to load
blobs from one single db or multiple dbs respectively. db_type argument is used blobs from one single db or multiple dbs respectively. `db_type` argument is used
to specify the type of the input db/dbs. to specify the type of the input db/dbs.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/load_save_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Load",
[],
["X", "Y"],
db="test_db",
db_type="lmdb"
)
workspace.RunOperatorOnce(op)
print("X:", workspace.FetchBlob("X"))
print("Y:", workspace.FetchBlob("Y"))
```
</details>
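<details>
<summary> <b>Example 2 (round trip with Save)</b> </summary>

A sketch of a save-then-load round trip so the example does not depend on a pre-existing db. It assumes the workspace root is writable and that the built-in "minidb" format is available; the db name "round_trip_db" is made up for illustration.

**Code**

```
workspace.ResetWorkspace()
workspace.FeedBlob("X", np.random.rand(2, 2).astype(np.float32))
workspace.FeedBlob("Y", np.random.rand(2, 2).astype(np.float32))

save_op = core.CreateOperator(
    "Save",
    ["X", "Y"],
    [],
    db="round_trip_db",
    db_type="minidb"
)
workspace.RunOperatorOnce(save_op)

workspace.ResetWorkspace()  # drop the blobs, then load them back by name

load_op = core.CreateOperator(
    "Load",
    [],
    ["X", "Y"],
    db="round_trip_db",
    db_type="minidb"
)
workspace.RunOperatorOnce(load_op)
print("X:", workspace.FetchBlob("X"))
print("Y:", workspace.FetchBlob("Y"))
```

</details>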
)DOC") )DOC")
.Input(
0,
"X, Y, ...",
"*(type: List(DBReader))* [OPTIONAL] List of DBReaders to load from. Can "
"use this instead of the `db`/`dbs` args.")
.Arg( .Arg(
"absolute_path", "absolute_path",
"(int, default 0) if set, use the db path directly and do not prepend " "*(type: int; default: 0)* If set to non-zero, save the db directly to "
"the current root folder of the workspace.") "the path specified by the `db` arg. If not set (default), prepend the "
"path of the current root folder of the workspace to the path specified "
"by the `db` arg.")
.Arg( .Arg(
"add_prefix", "add_prefix",
"(string, default=\"\") blobs will be prefixed with this when loading." "*(type: string, default: \"\")* Blobs will be prefixed with this when "
"Useful for avoiding collisions with blobs existing in the workspace." "loading. Useful for avoiding collisions with blobs existing in the "
"The output blob names specified to this op should include this prefix.") "workspace. The output blob names specified to this op should include "
"this prefix.")
.Arg( .Arg(
"strip_prefix", "strip_prefix",
"(string, default=\"\") characters in the provided blob " "*(type: string, default: \"\")* Characters in the provided blob names "
" names that match strip_prefix will be removed prior to loading." "that match `strip_prefix` will be removed prior to saving. Also, "
" Also, characters that precede strip_prefix will be removed. Useful " "characters that precede `strip_prefix` will be removed. Useful for "
" for removing device scope from blob names.") "removing device scope from blob names.")
.Arg("db", "(string) the path to the db to load.") .Arg("db", "*(type: string)* The output path of the db. See the "
"`absolute_path` arg details for options regarding the current root folder "
"of the workspace.")
.Arg( .Arg(
"dbs", "dbs",
"(list of strings) the paths to the dbs to load. This is used for loading" "*(type: List(string))* List of paths to dbs to load blobs from. See "
" blobs from multiple databases. If it is set, argument in \"db\" will be" "the `absolute_path` arg details for options regarding the current "
" ignored.") "root folder of the workspace.")
.Arg("db_type", "(string) the type of the db.") .Arg("db_type", "(type: string)* Type of db to save (options: \"lmdb\", "
"\"leveldb\", \"minidb\").")
.Arg( .Arg(
"keep_device", "keep_device",
"(int, default 0) if nonzero, the blobs are loaded into the device that " "*(type: int; default: 0)* If nonzero, the blobs are loaded into the "
"is specified in the serialized BlobProto. Otherwise, the device will be " "device that is specified in the serialized `BlobProto`. Otherwise, "
"set as the one that the Load operator is being run under.") "the device will be set as the one that the `Load` operator is being "
"run under.")
.Arg( .Arg(
"load_all", "load_all",
"(int, default 0) if nonzero, will load all blobs pointed to by the db " "*(type: int; default: 0)* If nonzero, will load all blobs pointed to "
"to the workspace overwriting/creating blobs as needed.") "by the db to the workspace overwriting/creating blobs as needed.")
.Arg( .Arg(
"allow_incomplete", "allow_incomplete",
"(bool, default false) if true, will allow not loading all the output " "*(type: bool; default: False)* If True, will allow not loading all "
"blobs specified in the outputs") "the output blobs specified in the outputs.")
.Arg( .Arg(
"source_blob_names", "source_blob_names",
"(list of strings) if set, used instead of output " "*(type: List(string))* If set, used instead of output blob names to "
"blob names, to specify which blobs in the db shall be loaded. Must be " "specify which blobs in the db shall be loaded. Must be the same "
"the same length as number of output blobs."); "length as number of output blobs.");
OPERATOR_SCHEMA(Save) OPERATOR_SCHEMA(Save)
.NumInputs(1, INT_MAX) .NumInputs(1, INT_MAX)
.NumOutputs(0) .NumOutputs(0)
.SetDoc(R"DOC( .SetDoc(R"DOC(
The Save operator saves a set of blobs to a db. It takes [1, infinity) number Saves a set of blobs to a db. It takes $[1, \infty)$ number of inputs and has
of inputs and has no output. The contents of the inputs are written into the no output. The contents of the inputs are written into the db using the
db specified by the arguments. settings specified by the arguments.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/load_save_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Save",
["X", "Y", "Z"],
[],
db="test_db2",
db_type="leveldb",
blob_name_overrides=["x_scores", "y_scores", "z_scores"]
)
workspace.FeedBlob("X", np.random.randint(20, size=(5,5)))
workspace.FeedBlob("Y", np.random.randint(20, size=(5,5)))
workspace.FeedBlob("Z", np.random.randint(20, size=(5,5)))
workspace.RunOperatorOnce(op)
```
</details>
)DOC") )DOC")
.Arg( .Arg(
"absolute_path", "absolute_path",
"(int, default 0) if set, use the db path directly and do not prepend " "*(type: int; default: 0)* If set to non-zero, save the db directly to "
"the current root folder of the workspace.") "the path specified by the `db` arg. If not set (default), prepend the "
"path of the current root folder of the workspace to the path specified "
"by the `db` arg.")
.Arg( .Arg(
"strip_prefix", "strip_prefix",
"(string, default=\"\") characters in the provided blob " "*(type: string, default: \"\")* Characters in the provided blob names "
" names that match strip_prefix will be removed prior to saving." "that match `strip_prefix` will be removed prior to saving. Also, "
" Also, characters that precede strip_prefix will be removed. Useful " "characters that precede `strip_prefix` will be removed. Useful for "
" for removing device scope from blob names.") "removing device scope from blob names.")
.Arg( .Arg(
"blob_name_overrides", "blob_name_overrides",
"(list of strings) if set, used instead of original " "*(List(string))* If set, used as blob names instead of original blob "
"blob names. Must be the same length as number of blobs.") "names. Must be same length as number of blobs.")
.Arg("db", "(string) the path to the db to load.") .Arg("db", "*(type: string)* The output path of the db. See the "
.Arg("db_type", "(string) the type of the db."); "`absolute_path` arg details for options regarding the current root folder "
"of the workspace.")
.Arg("db_type", "*(type: string)* Type of db to save (options: \"lmdb\", "
"\"leveldb\", \"minidb\").")
.Input(0, "X", "*(type: Tensor)* Input tensor(s).");
OPERATOR_SCHEMA(Checkpoint) OPERATOR_SCHEMA(Checkpoint)
.NumInputs(1, INT_MAX) .NumInputs(1, INT_MAX)

View File

@ -303,7 +303,199 @@ bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>); REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>); REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(LRN).NumInputs(1).NumOutputs(1, 2).InheritOnnxSchema("LRN"); OPERATOR_SCHEMA(LRN)
.NumInputs(1)
.NumOutputs(1, 2)
.SetDoc(R"DOC(
`LRN` applies Local Response Normalization to an input blob. This operation performs
a kind of "lateral inhibition" by normalizing over local input regions, where
normalization is applied across channels. This operator is typically used to
normalize an unbounded activation (such as ReLU). The output shape is the same as
the input shape. The `brew` module has a wrapper for this operator for use in a
`ModelHelper` object.
The formula for LRN is as follows:
$$b_{c} = a_{c}(bias + \frac{\alpha}{n}\sum_{c'=max(0,c-n/2)}^{min(N-1,c+n/2)} a_{c'}^2 )^{-\beta}$$
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator("LRN",
["X"],
["Y", "Y_scale"],
size=11,
alpha=0.001,
beta=0.5,
bias=2.0,
order="NHWC"
)
workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) # NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
print("Y_scale:\n", workspace.FetchBlob("Y_scale"))
```
**Result**
```
X:
[[[[ 0.72985137]
[-0.3753357 ]
[ 2.7344604 ]
[-0.5937792 ]
[ 0.38440478]
[-2.1659644 ]]
[[-0.92846817]
[-0.9996144 ]
[ 0.212943 ]
[-1.968045 ]
[-0.77839696]
[ 0.45492038]]
[[-0.11263168]
[ 1.9901097 ]
[ 0.19275683]
[ 0.15630436]
[ 0.7536298 ]
[-0.77339894]]
[[ 0.8353551 ]
[-0.7784452 ]
[ 1.779317 ]
[ 0.22421335]
[ 1.3846219 ]
[-3.0546608 ]]
[[ 0.09977621]
[ 2.2071757 ]
[ 0.79971045]
[ 3.563886 ]
[-0.7169287 ]
[ 0.77170426]]
[[-1.4296649 ]
[ 0.19181213]
[ 0.45961624]
[-1.0201577 ]
[ 0.62854475]
[-0.6395456 ]]]]
Y:
[[[[ 0.5160766 ]
[-0.26540157]
[ 1.9332271 ]
[-0.41986194]
[ 0.27181432]
[-1.5314047 ]]
[[-0.6565133 ]
[-0.7068181 ]
[ 0.15057328]
[-1.3914955 ]
[-0.5504022 ]
[ 0.32167578]]
[[-0.0796426 ]
[ 1.4070934 ]
[ 0.13629955]
[ 0.11052381]
[ 0.53288984]
[-0.5468682 ]]
[[ 0.5906759 ]
[-0.5504363 ]
[ 1.2580767 ]
[ 0.1585426 ]
[ 0.9790328 ]
[-2.1595135 ]]
[[ 0.07055242]
[ 1.5605361 ]
[ 0.5654725 ]
[ 2.5193207 ]
[-0.50693923]
[ 0.54567 ]]
[[-1.0108787 ]
[ 0.13563155]
[ 0.3249962 ]
[-0.72134334]
[ 0.44444424]
[-0.45222285]]]]
Y_scale:
[[[[2.0000484]
[2.0000129]
[2.0006797]
[2.000032 ]
[2.0000134]
[2.0004265]]
[[2.0000784]
[2.0000908]
[2.000004 ]
[2.0003521]
[2.000055 ]
[2.0000188]]
[[2.0000012]
[2.00036 ]
[2.0000033]
[2.0000021]
[2.0000517]
[2.0000544]]
[[2.0000634]
[2.000055 ]
[2.0002878]
[2.0000045]
[2.0001743]
[2.0008483]]
[[2.000001 ]
[2.000443 ]
[2.0000582]
[2.0011547]
[2.0000467]
[2.0000541]]
[[2.0001857]
[2.0000033]
[2.0000193]
[2.0000947]
[2.000036 ]
[2.0000372]]]]
```
</details>
)DOC")
.Arg("size", "*(type: int; default: 0)* Amount of neighboring channels to sum over for normalization")
.Arg("alpha", "*(type: float; default: 0)* Multiplicative (scaling) factor.")
.Arg("beta", "*(type: float; default: 0)* Exponent.")
.Arg("bias", "*(type: float; default: 1.0)* Additive factor.")
.Arg("order", "*(type: float; default: 'NCHW')* Order of blob dimensions.")
.Input(0, "X", "*(type: Tensor`<float>`)* Input data tensor (ReLU output).")
.Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.")
.Output(1, "Y_scale", "*(type: Tensor`<float>`)* Output scale.")
.InheritOnnxSchema("LRN");
OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1); OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1);
class GetLRNGradient : public GradientMakerBase { class GetLRNGradient : public GradientMakerBase {

View File

@ -15,16 +15,59 @@ OPERATOR_SCHEMA(Log)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
Calculates the natural log of the given input tensor, element-wise. This Calculates the natural log of the given input tensor ($ln(x)$), element-wise. This
operation can be done in an in-place fashion too, by providing the same input operation can be done in an in-place fashion too, by providing the same input
and output blobs. and output blobs.
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/log_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Log",
["X"],
["X"],
)
workspace.FeedBlob("X", (np.random.rand(3,3)).astype(np.float32))
print("X before running op:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("X after running op:", workspace.FetchBlob("X"))
```
**Result**
```
X before running op:
[[0.07341351 0.15404125 0.386613 ]
[0.34090295 0.99727786 0.24141751]
[0.32016268 0.8724168 0.93515724]]
X after running op:
[[-2.6116474 -1.8705349 -0.9503311 ]
[-1.0761575 -0.00272586 -1.4212275 ]
[-1.138926 -0.13648799 -0.06704059]]
```
</details>
)DOC") )DOC")
.Input(0, "input", "Input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output( .Output(
0, 0,
"output", "Y",
"The natural log of the input tensor computed " "*(type: Tensor`<float>`)* Output tensor computed as the natural log of the input tensor computed, element-wise.")
"element-wise")
.InheritOnnxSchema("Log"); .InheritOnnxSchema("Log");
namespace { namespace {

View File

@ -34,28 +34,84 @@ OPERATOR_SCHEMA(MatMul)
return out; return out;
}) })
.SetDoc(R"DOC( .SetDoc(R"DOC(
Matrix multiplication Y = A * B, where A has size (M x K), B has size (K x N), Matrix multiplication $Y = A * B$, where `A` has size (M x K), `B` has size
and Y will have a size (M x N). (K x N), and `Y` will have a size (M x N). To transpose `A` or `B` before
multiplication, pass 1 to the `trans_a` and/or `trans_b` arguments. The
`axis_a` and `axis_b` arguments control where the first and second dimensions
of the respective matrices are split.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/matmul_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"MatMul",
["A", "B"],
["Y"],
)
workspace.FeedBlob("A", np.random.randint(10, size=(3,3)).astype(np.float32))
workspace.FeedBlob("B", np.random.randint(10, size=(3,3)).astype(np.float32))
print("A:", workspace.FetchBlob("A"))
print("B:", workspace.FetchBlob("B"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
A: [[1. 8. 3.]
[6. 4. 4.]
[5. 4. 7.]]
B: [[4. 0. 3.]
[3. 1. 1.]
[8. 5. 8.]]
Y: [[52. 23. 35.]
[68. 24. 54.]
[88. 39. 75.]]
```
</details>
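<details>
<summary> <b>Example 2 (trans_a)</b> </summary>

A supplementary sketch (with hand-picked values rather than an existing test) of the `trans_a` argument described above: `A` is fed as a (3 x 2) matrix and transposed to (2 x 3) before the multiplication, so the expected output is $A^T B$ with shape (2 x 2).

**Code**

```
workspace.ResetWorkspace()

op = core.CreateOperator(
    "MatMul",
    ["A", "B"],
    ["Y"],
    trans_a=1
)

workspace.FeedBlob("A", np.array([[1, 2], [3, 4], [5, 6]]).astype(np.float32))  # (3 x 2)
workspace.FeedBlob("B", np.array([[1, 0], [0, 1], [1, 1]]).astype(np.float32))  # (3 x 2)
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))  # expected: [[6. 8.], [8. 10.]]
```

</details>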
)DOC") )DOC")
.Input(0, "A", "2D matrix of size (M x K)") .Input(
.Input(1, "B", "2D matrix of size (K x N)") 0,
.Output(0, "Y", "2D matrix of size (M x N)") "A",
"*(type: Tensor`<float>`)* 2D matrix of size (M x K).")
.Input(
1,
"B",
"*(type: Tensor`<float>`)* 2D matrix of size (K x N).")
.Output(
0,
"Y",
"*(type: Tensor`<float>`)* 2D matrix of size (M x N).")
.Arg( .Arg(
"axis_a", "axis_a",
"Exclusive axis that divides the first and second dimension \ "*(type: int; default: 1)* Exclusive axis that divides the first and "
of matrix A, default to 1") "second dimension of matrix `A`.")
.Arg( .Arg(
"axis_b", "axis_b",
"Exclusive axis that divides the first and second dimension \ "*(type: int; default: 1)* Exclusive axis that divides the first and "
of matrix B, default to 1") "second dimension of matrix `B`.")
.Arg( .Arg(
"trans_a", "trans_a",
"Pass 1 to transpose A before multiplication and after the \ "*(type: int; default: 0)* Pass 1 to transpose `A` before multiplication and "
dimension adjustment using axis_a") "after the dimension adjustment using `axis_a`.")
.Arg( .Arg(
"trans_b", "trans_b",
"Pass 1 to transpose B before multiplication and after the \ "*(type: int; default: 0)* Pass 1 to transpose `B` before multiplication and "
dimension adjustment using axis_b"); "after the dimension adjustment using `axis_b`.");
class GetMatMulGradient : public GradientMakerBase { class GetMatMulGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase; using GradientMakerBase::GradientMakerBase;

View File

@ -11,13 +11,70 @@ OPERATOR_SCHEMA(Mean)
.IdenticalTypeAndShapeOfInput(0) .IdenticalTypeAndShapeOfInput(0)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.SetDoc(R"DOC( .SetDoc(R"DOC(
Element-wise mean of each of the input tensors. The first input tensor can be Element-wise mean of an arbitrary number of input tensors. This operation can be
used in-place as the output tensor, in which case the mean will be done in performed in-place, by using the first input blob as the output blob. All inputs
place and results will be accumulated in input0. All inputs and outputs must must have the same shape and data type, and the output will have the same shape
have the same shape and data type. as the inputs.
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/mean_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Mean",
["X", "Y", "Z"],
["X"],
)
workspace.FeedBlob("X", (np.random.rand(3,3)).astype(np.float32))
workspace.FeedBlob("Y", (np.random.rand(3,3)).astype(np.float32))
workspace.FeedBlob("Z", (np.random.rand(3,3)).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
print("Y:", workspace.FetchBlob("Y"))
print("Z:", workspace.FetchBlob("Z"))
workspace.RunOperatorOnce(op)
print("Mean:", workspace.FetchBlob("X"))
```
**Result**
```
X:
[[0.6035237 0.5305746 0.6298913 ]
[0.9169737 0.01280353 0.16286302]
[0.6017664 0.9946255 0.05128575]]
Y:
[[0.07544111 0.45371833 0.08460239]
[0.9708728 0.7422064 0.7933344 ]
[0.97671497 0.3411384 0.73818344]]
Z:
[[0.08837954 0.90187573 0.46734726]
[0.6308827 0.8719029 0.39888734]
[0.90059936 0.92883426 0.5695987 ]]
Mean:
[[0.25578147 0.6287229 0.39394698]
[0.8395764 0.5423043 0.45169494]
[0.8263602 0.75486606 0.45302266]]
```
</details>
)DOC") )DOC")
.Input(0, "data_0", "First of the input tensors. Can be inplace.") .Input(0, "X, Y, ...", "*(type: Tensor`<Ord>`)* List of input tensors with the same shape.")
.Output(0, "mean", "Output tensor. Same dimension as inputs."); .Output(0, "M", "*(type: Tensor`<Ord>`)* Output tensor with the same dimensions as inputs. Contains "
"the mean values of the input tensors calculated element-wise.");
class GetMeanGradient : public GradientMakerBase { class GetMeanGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase; using GradientMakerBase::GradientMakerBase;

View File

@ -11,13 +11,70 @@ OPERATOR_SCHEMA(Max)
.IdenticalTypeAndShapeOfInput(0) .IdenticalTypeAndShapeOfInput(0)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.SetDoc(R"DOC( .SetDoc(R"DOC(
Element-wise max of each of the input tensors. The first input tensor can be Element-wise max of an arbitrary number of input tensors. This operation can be
used in-place as the output tensor, in which case the max will be done in performed in-place, by using the first input blob as the output blob. All inputs
place and results will be accumulated in input0. All inputs and outputs must must have the same shape and data type, and the output will have the same shape
have the same shape and data type. as the inputs.
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/minmax_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Max",
["X", "Y", "Z"],
["X"],
)
workspace.FeedBlob("X", (np.random.rand(3,3)).astype(np.float32))
workspace.FeedBlob("Y", (np.random.rand(3,3)).astype(np.float32))
workspace.FeedBlob("Z", (np.random.rand(3,3)).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
print("Y:", workspace.FetchBlob("Y"))
print("Z:", workspace.FetchBlob("Z"))
workspace.RunOperatorOnce(op)
print("Max:", workspace.FetchBlob("X"))
```
**Result**
```
X:
[[0.4496477 0.07061381 0.7139333 ]
[0.83203 0.05970785 0.72786295]
[0.75988126 0.04601283 0.32820013]]
Y:
[[0.05683139 0.16872478 0.671098 ]
[0.70739156 0.09878621 0.03416285]
[0.34087983 0.94986707 0.67263436]]
Z:
[[0.48051122 0.07141234 0.85264146]
[0.77086854 0.22082241 0.13154659]
[0.42401117 0.995431 0.4263775 ]]
Max:
[[0.48051122 0.16872478 0.85264146]
[0.83203 0.22082241 0.72786295]
[0.75988126 0.995431 0.67263436]]
```
</details>
)DOC") )DOC")
.Input(0, "data_0", "First of the input tensors. Can be inplace.") .Input(0, "X, Y, ...", "*(type: Tensor`<Ord>`)* List of input tensors with the same shape.")
.Output(0, "max", "Output tensor. Same dimension as inputs.") .Output(0, "M", "*(type: Tensor`<Ord>`)* Output tensor with same dimensions as input(s)."
"Contains the maximum valued element at each location.")
.InheritOnnxSchema("Max"); .InheritOnnxSchema("Max");
OPERATOR_SCHEMA(Min) OPERATOR_SCHEMA(Min)
@ -26,13 +83,63 @@ OPERATOR_SCHEMA(Min)
.IdenticalTypeAndShapeOfInput(0) .IdenticalTypeAndShapeOfInput(0)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.SetDoc(R"DOC( .SetDoc(R"DOC(
Element-wise min of each of the input tensors. The first input tensor can be Element-wise min of an arbitrary number of input tensors. This operation can be performed in-place, by using the first input blob as the output blob. All inputs must have the same shape and data type, and the output will have the same shape as the inputs.
used in-place as the output tensor, in which case the min will be done in
place and results will be accumulated in input0. All inputs and outputs must Github Link:
have the same shape and data type. - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/minmax_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Min",
["X", "Y", "Z"],
["X"],
)
workspace.FeedBlob("X", (np.random.rand(2,2)).astype(np.float32))
workspace.FeedBlob("Y", (np.random.rand(2,2)).astype(np.float32))
workspace.FeedBlob("Z", (np.random.rand(2,2)).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
print("Y:", workspace.FetchBlob("Y"))
print("Z:", workspace.FetchBlob("Z"))
workspace.RunOperatorOnce(op)
print("Min:", workspace.FetchBlob("X"))
```
**Result**
```
X:
[[0.32731926 0.4939747 ]
[0.29242373 0.43460014]]
Y:
[[0.40928316 0.916115 ]
[0.77526504 0.29339448]]
Z:
[[0.7899794 0.90335774]
[0.82599413 0.2843068 ]]
Min:
[[0.32731926 0.4939747 ]
[0.29242373 0.2843068 ]]
```
</details>
)DOC") )DOC")
.Input(0, "data_0", "First of the input tensors. Can be inplace.") .Input(0, "X, Y, ...", "*(type: Tensor`<Ord>`)* List of input tensors with the same shape.")
.Output(0, "min", "Output tensor. Same dimension as inputs.") .Output(0, "M", "*(type: Tensor`<Ord>`)* Output tensor with same dimensions as input(s)."
"Contains the minimum valued element at each location.")
.InheritOnnxSchema("Min"); .InheritOnnxSchema("Min");
template <typename T, class Context> template <typename T, class Context>

View File

@ -32,20 +32,68 @@ REGISTER_CPU_OPERATOR(Mod, ModOp<CPUContext>);
OPERATOR_SCHEMA(Mod) OPERATOR_SCHEMA(Mod)
.NumInputs(1) .NumInputs(1)
.NumOutputs(1) .NumOutputs(1)
.Arg("divisor", "The divisor of the modulo operation. Must >= 1") .Arg("divisor", "*(type: int; default: 0)* Divisor of the modulo operation (must be >= 1).")
.Arg( .Arg(
"sign_follow_divisor", "sign_follow_divisor",
"The sign of output follows Dividend if set to `false`. \ "*(type: bool; default: False)* If true, sign of output matches divisor, else if false, sign follows dividend.")
Otherwise follows Divisor")
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.SetDoc(R"DOC( .SetDoc(R"DOC(
Elementwise modulo operation. Each element in the output is the modulo result Element-wise modulo operation. Each element in the output is the modulo result
of the corresponding elment in the input data. The divisor of the modulo is of the corresponding element in the input data. The divisor of the modulo is
provided by the operator argument `divisor`. provided by the `divisor` argument.
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/mod_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Mod",
["X"],
["Y"],
divisor=10
)
workspace.FeedBlob("X", (np.random.randint(100, size=(5,5))))
print("X before running op:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("X after running op:", workspace.FetchBlob("Y"))
```
**Result**
```
X before running op:
[[56 22 43 13 60]
[ 4 55 58 10 45]
[64 66 4 3 66]
[10 36 47 52 78]
[91 4 36 47 95]]
Y:
[[6 2 3 3 0]
[4 5 8 0 5]
[4 6 4 3 6]
[0 6 7 2 8]
[1 4 6 7 5]]
```
</details>
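<details>
<summary> <b>Example 2 (negative values)</b> </summary>

A supplementary sketch (values chosen by hand) of the `sign_follow_divisor` argument described above; it assumes the usual convention that, with the flag set, a nonzero remainder is shifted so its sign matches the divisor.

**Code**

```
workspace.ResetWorkspace()

op = core.CreateOperator(
    "Mod",
    ["X"],
    ["Y"],
    divisor=3,
    sign_follow_divisor=True
)

workspace.FeedBlob("X", np.array([-5, -1, 4]).astype(np.int32))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))  # expected: [1 2 1] (e.g. -5 -> -2 -> -2 + 3 = 1)
```

</details>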
)DOC") )DOC")
.Input(0, "data", "input int32 or int64 data") .Input(0, "X", "*(type: Tensor`<int>`)* Input tensor with int32 or int64 data.")
.Output(0, "output", "output of data with modulo operation applied"); .Output(0, "Y", "*(type: Tensor`<int>`)* Output tensor of data with modulo operation applied.");
SHOULD_NOT_DO_GRADIENT(ModOp); SHOULD_NOT_DO_GRADIENT(ModOp);
} // namespace } // namespace

View File

@ -17,9 +17,48 @@ OPERATOR_SCHEMA(Negative)
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
Computes the element-wise negative of the input. Computes the element-wise negative of the input.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/negative_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Negative",
["X"],
["Y"]
)
workspace.FeedBlob("X", (np.random.rand(3,3).astype(np.float32)))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X: [[0.83296907 0.61407167 0.32562155]
[0.59304523 0.03111175 0.29365504]
[0.09478621 0.5424558 0.73940724]]
Y: [[-0.83296907 -0.61407167 -0.32562155]
[-0.59304523 -0.03111175 -0.29365504]
[-0.09478621 -0.5424558 -0.73940724]]
```
</details>
)DOC") )DOC")
.Input(0, "X", "1D input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* 1D input tensor.")
.Output(0, "Y", "1D input tensor") .Output(0, "Y", "*(type: Tensor`<float>`)* 1D output tensor.")
.InheritOnnxSchema("Neg"); .InheritOnnxSchema("Neg");
namespace { namespace {

View File

@ -728,19 +728,141 @@ bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNHWC() {
return true; return true;
} }
const char* kAveragePoolDoc = R"DOC( const char* kAveragePoolDoc = R"DOC(
consumes an input blob X and applies average pooling across the consumes an input blob and applies average pooling across the blob according
the blob according to kernel sizes, stride sizes, and pad lengths defined by the to kernel sizes, stride sizes, pad lengths and dilation. Average pooling consists
ConvPoolOpBase operator. Average pooling consisting of averaging all values of a of taking the average value of a subset of the input tensor according to the kernel
subset of the input tensor according to the kernel size and downsampling the size and downsampling the data into the output blob for further processing. The
data into the output blob Y for further processing. `brew` module has a wrapper for this operator for use in a `ModelHelper` object.
Pooling layers reduce the spatial dimensionality of the input blob. Each of the
output blob's dimensions will reduce according to:
$$dim_{out}=\frac{dim_{in}-kernel+2*pad}{stride}+1$$
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/conv_pool_op_base.h
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"AveragePool",
["X"],
["Y"],
kernel=2,
stride=2,
)
workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[[[-0.2883434 0.43498734 0.05417408 1.912558 0.09390241
-0.33173105]
[ 1.633709 1.2047161 0.36964908 0.99961185 0.4184147
0.9989975 ]
[ 1.7644193 0.1789665 1.5812988 -0.6038542 -0.36090398
0.33195344]
[ 0.9457722 -0.95174325 -0.78124577 1.2062047 1.1903144
0.2586746 ]
[ 1.252104 0.32645547 1.8073524 -0.78397465 0.9978303
-0.97614396]
[ 0.5440196 1.5778259 -0.76750124 0.5051756 0.8838398
-0.37085298]]]]
Y:
[[[[0.7462672 0.83399826 0.2948959 ]
[0.4843537 0.3506009 0.35500962]
[0.9251013 0.19026303 0.13366827]]]]
```
</details>
)DOC"; )DOC";
const char* kMaxPoolDoc = R"DOC( const char* kMaxPoolDoc = R"DOC(
consumes an input blob X and applies max pooling across the consumes an input blob and applies max pooling across the blob according to
the blob according to kernel sizes, stride sizes, and pad lengths defined by the kernel sizes, stride sizes, pad lengths and dilation. Max pooling consists of
ConvPoolOpBase operator. Max pooling consisting of taking the maximum value of a taking the maximum value of a subset of the input tensor according to the kernel
subset of the input tensor according to the kernel size and downsampling the size and downsampling the data into the output blob for further processing. The
data into the output blob Y for further processing. `brew` module has a wrapper for this operator for use in a `ModelHelper` object.
Pooling layers reduce the spatial dimensionality of the input blob. Each of the
output blob's dimensions will reduce according to:
$$dim_{out}=\frac{dim_{in}-kernel+2*pad}{stride}+1$$
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/pool_op.cc
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/conv_pool_op_base.h
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"MaxPool",
["X"],
["Y"],
kernel=2,
stride=2,
)
workspace.FeedBlob("X", np.random.randn(1, 1, 6, 6).astype(np.float32)) # NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[[[-2.8534958e-01 -1.7719941e+00 -8.2277227e-04 1.1088650e+00
-2.1476576e+00 -3.5070452e-01]
[-9.0058845e-01 -3.0070004e-01 -1.7907504e+00 -7.1746534e-01
1.2798511e+00 -3.2214901e-01]
[ 1.5806322e+00 1.6845188e+00 -2.6633200e-01 -3.8576153e-01
-9.6424848e-02 -3.9696163e-01]
[ 1.2572408e-01 6.3612902e-01 -3.9554062e-01 -6.9735396e-01
-9.1898698e-01 -1.9609968e-01]
[-1.1587460e+00 2.4605224e+00 -1.5497679e+00 1.3020347e-01
-8.1293899e-01 -7.8803545e-01]
[ 1.4323474e+00 1.3618395e+00 9.8975077e-02 -1.1307785e-01
7.2035044e-01 2.7642491e-01]]]]
Y:
[[[[-0.28534958 1.108865 1.2798511 ]
[ 1.6845188 -0.266332 -0.09642485]
[ 2.4605224 0.13020347 0.72035044]]]]
```
</details>
)DOC"; )DOC";
std::function<void(OpSchema&)> AveragePoolDocGenerator(const char* dim) { std::function<void(OpSchema&)> AveragePoolDocGenerator(const char* dim) {
@ -752,18 +874,18 @@ std::function<void(OpSchema&)> AveragePoolDocGenerator(const char* dim) {
schema.Input( schema.Input(
0, 0,
"X", "X",
"Input data tensor from the previous operator; dimensions depend on " "*(type: Tensor`<float>`)* Input data tensor of shape NCHW or NHWC.");
"whether the NCHW or NHWC operators are being used. For example, in "
"the former, the input has size (N x C x H x W), where N is the batch "
"size, C is the number of channels, and H and W are the height and the "
"width of the data. The corresponding permutation of dimensions is "
"used in the latter case.");
schema.Output( schema.Output(
0, 0,
"Y", "Y",
"Output data tensor from average pooling across the input " "*(type: Tensor`<float>`)* Output data tensor.");
"tensor. Dimensions will vary based on various kernel, stride, and pad " /*
"sizes."); schema.Arg("kernel", "*(type: int)* Size of the window to take an average over.");
schema.Arg("stride", "*(type: int)* Stride of the window.");
schema.Arg("pad", "*(type: int)* Implicit zero padding to be added on both sides.");
schema.Arg("dilation", "*(type: int)* Parameter that controls the stride of elements in the window.");
schema.Arg("order", "*(type: string; default: 'NCHW')* Order of the blob dimensions.");
*/
}; };
} }
@ -776,18 +898,18 @@ std::function<void(OpSchema&)> MaxPoolDocGenerator(const char* dim) {
schema.Input( schema.Input(
0, 0,
"X", "X",
"Input data tensor from the previous operator; dimensions depend on " "*(type: Tensor`<float>`)* Input data tensor of shape NCHW or NHWC.");
"whether the NCHW or NHWC operators are being used. For example, in "
"the former, the input has size (N x C x H x W), where N is the batch "
"size, C is the number of channels, and H and W are the height and the "
"width of the data. The corresponding permutation of dimensions is "
"used in the latter case.");
schema.Output( schema.Output(
0, 0,
"Y", "Y",
"Output data tensor from max pooling across the input " "*(type: Tensor`<float>`)* Output data tensor.");
"tensor. Dimensions will vary based on various kernel, stride, and pad " /*
"sizes."); schema.Arg("kernel", "*(type: int)* Size of the window to take an average over.");
schema.Arg("stride", "*(type: int)* Stride of the window.");
schema.Arg("pad", "*(type: int)* Implicit zero padding to be added on both sides.");
schema.Arg("dilation", "*(type: int)* Parameter that controls the stride of elements in the window.");
schema.Arg("order", "*(type: string; default: 'NCHW')* Order of the blob dimensions.");
*/
}; };
} }
REGISTER_CPU_OPERATOR( REGISTER_CPU_OPERATOR(

View File

@ -28,9 +28,7 @@ class PrependDimOp : public Operator<Context> {
CAFFE_ENFORCE( CAFFE_ENFORCE(
input.dim(0) % dim_size_ == 0, input.dim(0) % dim_size_ == 0,
"First dimension must be multiple of prepend_dim. Current first dimension: ", "First dimension must be multiple of prepend_dim. Current first dimension: ",
input.dim(0), input.dim(0));
", prepend dim: ",
dim_size_);
vector<int64_t> actual_new_shape(input.ndim() + 1); vector<int64_t> actual_new_shape(input.ndim() + 1);
actual_new_shape[0] = dim_size_; actual_new_shape[0] = dim_size_;

View File

@ -105,22 +105,76 @@ OPERATOR_SCHEMA(Reshape)
}) })
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.SetDoc(R"DOC( .SetDoc(R"DOC(
Reshape the input tensor similar to numpy.reshape. Reshape the input tensor similar to numpy's
[reshape](https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html).
It takes a tensor as input and an optional tensor specifying the new shape. Takes a tensor as input and an optional tensor specifying the new shape. When
When the second input is absent, an extra argument `shape` must be specified. the second input is absent, an extra argument `shape` must be specified. Outputs
It outputs the reshaped tensor as well as the original shape. the reshaped tensor as well as the original shape.
At most one dimension of the new shape can be -1. In this case, the value is At most one dimension of the new shape can be -1. In this case, the value is
inferred from the size of the tensor and the remaining dimensions. A dimension inferred from the size of the tensor and the remaining dimensions. A dimension
could also be 0, in which case the actual dimension value is going to be copied could also be 0, in which case the actual dimension value is going to be copied
from the input tensor. from the input tensor.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reshape_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Reshape",
["data"],
["reshaped", "old_shape"],
shape=(3,2)
)
workspace.FeedBlob("data", (np.random.randint(100, size=(6))))
print("data:", workspace.FetchBlob("data"))
workspace.RunOperatorOnce(op)
print("reshaped:", workspace.FetchBlob("reshaped"))
print("old_shape:", workspace.FetchBlob("old_shape"))
```
**Result**
```
data: [86 60 85 96 7 37]
reshaped: [[86 60]
[85 96]
[ 7 37]]
old_shape: [6]
```
</details>
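<details>
<summary> <b>Example 2 (-1 and 0 in the shape)</b> </summary>

A supplementary sketch (not from the original docs) of the -1 and 0 conventions described above: 0 copies the corresponding dimension from the input, and -1 is inferred from the remaining size, so a (2,6) input reshaped with shape=(0,3,-1) is expected to come out as (2,3,2).

**Code**

```
workspace.ResetWorkspace()

op = core.CreateOperator(
    "Reshape",
    ["data"],
    ["reshaped", "old_shape"],
    shape=(0, 3, -1)
)

workspace.FeedBlob("data", np.arange(12).reshape(2, 6).astype(np.float32))
workspace.RunOperatorOnce(op)
print("reshaped shape:", workspace.FetchBlob("reshaped").shape)  # expected: (2, 3, 2)
print("old_shape:", workspace.FetchBlob("old_shape"))            # expected: [2 6]
```

</details>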
)DOC") )DOC")
.Arg("shape", "New shape") .Arg("shape", "*(type: Tuple(int))* New shape. Do not set if using "
.Input(0, "data", "An input tensor.") "`new_shape` input.")
.Input(1, "new_shape", "New shape.") .Input(
.Output(0, "reshaped", "Reshaped data.") 0,
.Output(1, "old_shape", "Original shape.") "data",
"*(type: Tensor)* Input tensor.")
.Input(
1,
"new_shape",
"*(type: Tensor`<int>`)* [OPTIONAL] Tensor containing new shape.")
.Output(
0,
"reshaped",
"*(type: Tensor)* Reshaped output tensor.")
.Output(
1,
"old_shape",
"*(type: Tensor`<int>`)* Tensor containing old shape of `data`.")
.InheritOnnxSchema("Reshape"); .InheritOnnxSchema("Reshape");
class GetReshapeGradient : public GradientMakerBase { class GetReshapeGradient : public GradientMakerBase {

View File

@ -48,7 +48,6 @@ class ReshapeOp : public Operator<Context> {
auto& shape = Input(1); auto& shape = Input(1);
CAFFE_ENFORCE(shape.ndim() == 1, "Shape should be 1-D"); CAFFE_ENFORCE(shape.ndim() == 1, "Shape should be 1-D");
if (shape.size()) {
const T* shape_data = shape.template data<T>(); const T* shape_data = shape.template data<T>();
// Bit awkward, but needed so works on both CPU and CUDA contexts // Bit awkward, but needed so works on both CPU and CUDA contexts
@ -57,7 +56,6 @@ class ReshapeOp : public Operator<Context> {
shape.size() * sizeof(T), shape_data, &tmpv[0]); shape.size() * sizeof(T), shape_data, &tmpv[0]);
actual_new_shape.assign(tmpv.begin(), tmpv.begin() + shape.size()); actual_new_shape.assign(tmpv.begin(), tmpv.begin() + shape.size());
} }
}
// Copy over the dimensions for those that are specified zero. // Copy over the dimensions for those that are specified zero.
for (int i = 0; i < actual_new_shape.size() && i < input.ndim(); ++i) { for (int i = 0; i < actual_new_shape.size() && i < input.ndim(); ++i) {

View File

@ -285,60 +285,209 @@ OPERATOR_SCHEMA(AddPadding)
.NumInputs(1, 4) .NumInputs(1, 4)
.NumOutputs(1, 2) .NumOutputs(1, 2)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Given a partitioned tensor T<N, D1..., Dn>, where the partitions are Given a partitioned tensor $T<N, D_1, ..., D_n>$, where the partitions are
defined as ranges on its outer-most (slowest varying) dimension N, defined as ranges on its outer-most (slowest varying) dimension $N$,
with given range lengths, return a tensor T<N + 2*padding_width, D1 ..., Dn> return a tensor $T<(N + 2 * padding\_width), D_1, ..., D_n>$ with paddings
with paddings added to the start and end of each range. added to the start and end of each range.
Optionally, different paddings can be provided for beginning and end. Paddings
provided must be a tensor T<D1..., Dn>. Optionally, different paddings can be provided for beginning and end.
Paddings provided must be a tensor $T<D_1, ..., D_n>$. If no padding is
provided, add zero padding. If no lengths vector is provided, add padding
only once, at the start and end of data.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/sequence_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"AddPadding",
["X", "lengths"],
["Y", "lengths_out"],
padding_width=1
)
workspace.FeedBlob("X", (np.random.rand(3,2,2).astype(np.float32)))
workspace.FeedBlob("lengths", np.array([3]).astype(np.int32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
print("lengths_out:", workspace.FetchBlob("lengths_out"))
```
**Result**
```
X: [[[0.2531572 0.4588472 ]
[0.45140603 0.61161053]]
[[0.92500854 0.8045306 ]
[0.03356671 0.30233648]]
[[0.4660227 0.6287745 ]
[0.79372746 0.08609265]]]
Y: [[[0. 0. ]
[0. 0. ]]
[[0.2531572 0.4588472 ]
[0.45140603 0.61161053]]
[[0.92500854 0.8045306 ]
[0.03356671 0.30233648]]
[[0.4660227 0.6287745 ]
[0.79372746 0.08609265]]
[[0. 0. ]
[0. 0. ]]]
lengths_out: [5]
```
</details>
If no padding is provided, add zero padding.
If no lengths vector is provided, add padding only once,
at the start and end of data.
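<details>
<summary> <b>Example 2 (explicit padding values)</b> </summary>

A supplementary sketch (hand-picked values, not from the original docs) of the optional `start_padding` input described above; it assumes the padding blob has the same inner dimensions and data type as the data, here a single row of shape (2,) for data of shape (3,2). Since no `end_padding` is given, the same values pad both ends.

**Code**

```
workspace.ResetWorkspace()

op = core.CreateOperator(
    "AddPadding",
    ["X", "lengths", "start_padding"],
    ["Y", "lengths_out"],
    padding_width=1
)

workspace.FeedBlob("X", np.ones((3, 2)).astype(np.float32))
workspace.FeedBlob("lengths", np.array([3]).astype(np.int32))
workspace.FeedBlob("start_padding", np.array([9, 9]).astype(np.float32))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))                      # expected: rows of 9s added before and after the ones
print("lengths_out:", workspace.FetchBlob("lengths_out"))  # expected: [5]
```

</details>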
)DOC") )DOC")
.Arg( .Arg(
"padding_width", "padding_width",
"Number of copies of padding to add around each range.") "*(type: int)* Number of copies of padding to add around each range.")
.Arg( .Arg(
"end_padding_width", "end_padding_width",
"(Optional) Specifies a different end-padding width.") "*(type: int)* [OPTIONAL] Specifies a different end-padding width. If "
.Input(0, "data_in", "(T<N, D1..., Dn>) Input data") "this is not set, will use same as `padding_width`.")
.Input(
0,
"data_in",
"*(type: Tensor)* Input data ($T<N, D_1, ..., D_n>$).")
.Input( .Input(
1, 1,
"lengths", "lengths",
"(i64) Num of elements in each range. sum(lengths) = N.") "*(type: Tensor`<int>`)* Number of elements in each range. "
.Input(2, "start_padding", "T<D1..., Dn> Padding data for range start.") "sum(lengths) = N.")
.Input(
2,
"start_padding",
"*(type: Tensor`<int>`)* [OPTIONAL] Padding data for range start "
"($T<D_1, ..., D_n>$).")
.Input( .Input(
3, 3,
"end_padding", "end_padding",
"T<D1..., Dn> (optional) Padding for range end. " "*(type: Tensor`<int>`)* [OPTIONAL] Padding for range end. If not "
"If not provided, start_padding is used as end_padding as well.") "provided, `start_padding` is used ($T<D_1, ..., D_n>$).")
.Output(0, "data_out", "(T<N + 2*padding_width, D1..., Dn>) Padded data.") .Output(
.Output(1, "lengths_out", "(i64, optional) Lengths for each padded range."); 0,
"data_out",
"*(type: Tensor)* Padded data tensor ($T<N + 2*padding\_width, "
"D_1, ..., D_n>$).")
.Output(
1,
"lengths_out",
"*(type: Tensor`<int>`)* [OPTIONAL] Lengths for each padded range.");
OPERATOR_SCHEMA(RemovePadding) OPERATOR_SCHEMA(RemovePadding)
.NumInputs(1, 2) .NumInputs(1, 2)
.NumOutputs(1, 2) .NumOutputs(1, 2)
.SetDoc(R"DOC( .SetDoc(R"DOC(
Remove padding around the edges of each segment of the input data. This is Remove padding around the edges of each segment of the input data. This is the
the reverse opration of AddPadding, and uses the same arguments and conventions reverse operation of **AddPadding**, and uses the same arguments and conventions
for input and output data format. for input and output data format.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/sequence_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
addpad_op = core.CreateOperator(
"AddPadding",
["X", "lengths_add"],
["Y", "lengths_out_add"],
padding_width=1
)
rmpad_op = core.CreateOperator(
"RemovePadding",
["Y", "lengths_rm"],
["Z", "lengths_out_rm"],
padding_width=1
)
workspace.FeedBlob("X", (np.random.randint(20, size=(3,5))))
workspace.FeedBlob("lengths_add", np.array([3]).astype(np.int32))
workspace.FeedBlob("lengths_rm", np.array([5]).astype(np.int32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(addpad_op)
print("Y:", workspace.FetchBlob("Y"))
print("lengths_out_add:", workspace.FetchBlob("lengths_out_add"))
workspace.RunOperatorOnce(rmpad_op)
print("Z:", workspace.FetchBlob("Z"))
print("lengths_out_rm:", workspace.FetchBlob("lengths_out_rm"))
```
**Result**
```
X: [[17 19 1 9 1]
[19 3 5 19 1]
[16 0 0 0 4]]
Y: [[ 0 0 0 0 0]
[17 19 1 9 1]
[19 3 5 19 1]
[16 0 0 0 4]
[ 0 0 0 0 0]]
lengths_out_add: [5]
Z: [[17 19 1 9 1]
[19 3 5 19 1]
[16 0 0 0 4]]
lengths_out_rm: [3]
```
</details>
)DOC") )DOC")
.Arg("padding_width", "Outer-size of padding to remove around each range.") .Arg(
"padding_width",
"*(type: int)* Outer-size of padding to remove around each range.")
.Arg( .Arg(
"end_padding_width", "end_padding_width",
"(Optional) Specifies a different end-padding width.") "*(type: int)* [OPTIONAL] Specifies a different end-padding width. "
.Input(0, "data_in", "T<N, D1..., Dn> Input data") "If this is not set, will use same as `padding_width`.")
.Input(
0,
"data_in",
"Input tensor ($T<N, D_1, ..., D_n>$).")
.Input( .Input(
1, 1,
"lengths", "lengths",
"(i64) Num of elements in each range. sum(lengths) = N. " "*(type: Tensor`<int>`)* Number of elements in each range. "
"If not provided, considers all data as a single segment.") "sum(lengths) = N. If not provided, considers all data as a single "
.Output(0, "data_out", "(T<N - 2*padding_width, D1..., Dn>) Unpadded data.") "segment.")
.Output(
0,
"data_out",
"*(type: Tensor)* Padded data tensor "
"($T<N + 2*padding\_width, D_1, ..., D_n>$).")
.Output( .Output(
1, 1,
"lengths_out", "lengths_out",
"(i64, optional) Lengths for each unpadded range."); "*(type: Tensor`<int>`)* [OPTIONAL] Lengths for each padded range.");
OPERATOR_SCHEMA(GatherPadding) OPERATOR_SCHEMA(GatherPadding)
.NumInputs(2) .NumInputs(2)

View File

@ -9,9 +9,9 @@ OPERATOR_SCHEMA(Shape)
.NumOutputs(1) .NumOutputs(1)
.Arg( .Arg(
"axes", "axes",
"(int[]) array of interested axes." "*(type: int[])* Array of interested axes."
"If given, this operators only returns the dimension of given axes." "If given, this operator only returns the dimensions of the given axes."
"Otherwise, the operator returns full dimension.") "Otherwise, the operator returns the dimensions of all axes.")
.TensorInferenceFunction([](const OperatorDef& def, .TensorInferenceFunction([](const OperatorDef& def,
const vector<TensorShape>& in) { const vector<TensorShape>& in) {
ArgumentHelper args(def); ArgumentHelper args(def);
@ -27,8 +27,51 @@ OPERATOR_SCHEMA(Shape)
}) })
.SetDoc(R"DOC( .SetDoc(R"DOC(
Produce a 1D int64 tensor with the shape of the input tensor. Produce a 1D int64 tensor with the shape of the input tensor.
If called with an optional argument \"axes\", the result will only If called with an optional argument `axes`, the result will only
contain the dimension of specified axes in particular order.)DOC"); contain the dimensions of specified axes.
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/shape_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Shape",
["X"],
["shape"],
)
workspace.FeedBlob("X", (np.random.randint(10, size=(2,3))))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("shape:", workspace.FetchBlob("shape"))
```
**Result**
```
X:
[[3 2 5]
[5 7 3]]
shape: [2 3]
```
</details>
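A sketch of the `axes` argument (not part of the original example; assuming it simply restricts the output to the listed dimensions):

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()
op = core.CreateOperator(
    "Shape",
    ["X"],
    ["shape"],
    axes=[1]            # only report the size of axis 1
)
workspace.FeedBlob("X", np.random.randint(10, size=(2, 3)))
workspace.RunOperatorOnce(op)
print("shape:", workspace.FetchBlob("shape"))  # expected: [3]
```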
)DOC")
.Input(0,"X", "*(type: Tensor)* Input tensor.")
.Output(0,"shape", "*(type: Tensor)* Output tensor containing shape of input tensor.");
SHOULD_NOT_DO_GRADIENT(Shape); SHOULD_NOT_DO_GRADIENT(Shape);

View File

@ -25,12 +25,55 @@ OPERATOR_SCHEMA(Sigmoid)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
Sigmoid takes one input data (Tensor<T>) and produces one output data Apply the Sigmoid function element-wise to the input tensor. This is often used
(Tensor<T>) where the sigmoid function, y = 1 / (1 + exp(-x)), is applied to the as a non-linear activation function in a neural network. The sigmoid function is
tensor elementwise. defined as:
$$Sigmoid(x) = \frac{1}{1+\exp(-x)}$$
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/sigmoid_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Sigmoid",
["X"],
["Y"]
)
workspace.FeedBlob("X", np.random.randn(5).astype(np.float32))
print("input:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("sigmoid:", workspace.FetchBlob("Y"))
```
**Result**
```
input: [ 1.5744036 0.31632107 1.7842269 1.4450722 -2.1726978 ]
sigmoid: [0.8284105 0.57842743 0.85621804 0.80923885 0.10222916]
```
</details>
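As a sanity check (a NumPy sketch, not part of the original docs), the output above can be reproduced directly from the formula:

```
import numpy as np

x = np.array([1.5744036, 0.31632107, 1.7842269, 1.4450722, -2.1726978],
             dtype=np.float32)
y = 1.0 / (1.0 + np.exp(-x))   # Sigmoid(x) = 1 / (1 + exp(-x))
print(y)  # matches the operator output above up to float32 rounding
```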
)DOC") )DOC")
.Input(0, "X", "1D input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output(0, "Y", "1D output tensor") .Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.")
.InheritOnnxSchema("Sigmoid"); .InheritOnnxSchema("Sigmoid");
// Input: Y, dY, output: dX // Input: Y, dY, output: dX
OPERATOR_SCHEMA(SigmoidGradient) OPERATOR_SCHEMA(SigmoidGradient)

View File

@ -38,9 +38,52 @@ OPERATOR_SCHEMA(Sin)
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
Calculates the sine of the given input tensor, element-wise. Calculates the sine of the given input tensor, element-wise.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/sin_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Sin",
["X"],
["Y"]
)
workspace.FeedBlob("X", np.random.rand(5).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X: [0.8466114 0.1803606 0.5601509 0.04959291 0.64770824]
Y: [0.74903965 0.17938434 0.5313141 0.04957259 0.60336035]
```
</details>
)DOC") )DOC")
.Input(0, "input", "Input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output(0, "output", "The sine of the input tensor computed element-wise"); .Output(
0,
"Y",
"*(type: Tensor`<float>`)* Output tensor calculated as the sine of the input tensor, element-wise.");
OPERATOR_SCHEMA(SinGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape(); OPERATOR_SCHEMA(SinGradient).NumInputs(2).NumOutputs(1).IdenticalTypeAndShape();

View File

@ -83,31 +83,75 @@ OPERATOR_SCHEMA(Softmax)
.NumOutputs(1) .NumOutputs(1)
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
The operator computes the softmax normalized values for each layer in the batch
of the given input. The input is a 2-D tensor (Tensor<float>) of size
(batch_size x input_feature_dimensions). The output tensor has the same shape
and contains the softmax normalized values of the corresponding input.
X does not need to explicitly be a 2D vector; rather, it will be Applies the Softmax function to an n-dimensional input Tensor, rescaling it so
coerced into one. For an arbitrary n-dimensional tensor that the elements of the n-dimensional output Tensor lie in the range (0,1) and
X \in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}] and k is sum to 1. The softmax operator is typically the last layer in a classifier network,
the axis provided, then X will be coerced into a 2-dimensional tensor with as its output can be interpreted as confidence probabilities of an input belonging
dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default to each class. The input is a 2-D tensor (Tensor) of size (batch_size x
case where axis=1, this means the X tensor will be coerced into a 2D tensor input_feature_dimensions). The output tensor has the same shape and contains the
of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size. softmax normalized values of the corresponding input. The softmax function is
In this situation, we must have a_0 = N and a_1 * ... * a_{n-1} = D. defined as follows:
Each of these dimensions must be matched correctly, or else the operator
will throw errors. $$softmax(x_i) = \frac{\exp(x_i)}{\sum_{j} \exp(x_j)}$$
The input does not need to explicitly be a 2D vector; rather, it will be coerced
into one. For an arbitrary n-dimensional tensor `X` in
$[a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}]$, where k is the `axis` provided,
then `X` will be coerced into a 2-dimensional tensor with dimensions
$[(a_0 * ... * a_{k-1}), (a_k * ... * a_{n-1})]$. For the default case where
`axis`=1, the `X` tensor will be coerced into a 2D tensor of dimensions
$[a_0, (a_1 * ... * a_{n-1})]$, where $a_0$ is often the batch size. In this
situation, we must have $a_0 = N$ and $a_1 * ... * a_{n-1} = D$. Each of these
dimensions must be matched correctly, or else the operator will throw errors.
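As an illustration of this coercion step, here is a NumPy sketch (illustrative only, not the operator's actual implementation):

```
import numpy as np

def softmax_coerced(X, axis=1):
    # Collapse leading dims into N and trailing dims into D, as described above.
    N = int(np.prod(X.shape[:axis]))
    D = int(np.prod(X.shape[axis:]))
    X2 = X.reshape(N, D)
    e = np.exp(X2 - X2.max(axis=1, keepdims=True))  # shift by row max for stability
    return (e / e.sum(axis=1, keepdims=True)).reshape(X.shape)

x = np.random.randn(2, 3, 4).astype(np.float32)
y = softmax_coerced(x, axis=1)
print(y.reshape(2, -1).sum(axis=1))  # each coerced row sums to 1
```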
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Softmax",
["X"],
["Y"]
)
workspace.FeedBlob("X", np.random.randn(1, 5).astype(np.float32))
print("input:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("softmax:", workspace.FetchBlob("Y"))
```
**Result**
```
input: [[ 0.0417839 0.61960053 -0.23150268 -0.64389366 -3.0000346 ]]
softmax: [[0.24422921 0.43525138 0.18582782 0.12303016 0.01166145]]
```
</details>
)DOC") )DOC")
.Arg("axis", .Arg("axis",
"(int) default to 1; describes the axis of the inputs when coerced " "*(type: int; default: 1)* Axis of the inputs when coerced to 2D matrix.")
"to 2D; defaults to one because the 0th axis most likely describes " .Input(0, "X",
"the batch_size") "*(type: Tensor`<float>`)* Input tensor that's coerced into a 2D matrix of size (NxD) as described above.")
.Input(0, "input", .Output(0, "Y",
"The input tensor that's coerced into a 2D matrix of size (NxD) " "*(type: Tensor`<float>`)* The softmax normalized output tensor with the same shape as input tensor.")
"as described above.")
.Output(0, "output", "The softmax normalized output values with the same "
"shape as input tensor.")
.InheritOnnxSchema("Softmax"); .InheritOnnxSchema("Softmax");
// Input: Y, dY. Output: dX // Input: Y, dY. Output: dX

View File

@ -35,26 +35,115 @@ OPERATOR_SCHEMA(SoftmaxWithLoss)
return out; return out;
}) })
.SetDoc(R"DOC( .SetDoc(R"DOC(
Combined Softmax and Cross-Entropy loss operator. Combined Softmax and Cross-Entropy loss operator. The operator first computes the softmax normalized values for each layer in the batch of the given input, then computes cross-entropy loss. This operator is numerically more stable than separate `Softmax` and `CrossEntropy` ops. The inputs are a 2-D tensor `logits` of size (batch_size x input_feature_dimensions), which represents the unscaled log probabilities, and a 1-dimensional integer `labels` tensor for ground truth. An optional third input blob (`weight_tensor`) can be used to weight the samples for the loss, which is useful if the training set is unbalanced. This operator outputs a `softmax` tensor which contains the probability for each label for each example (same shape as the `logits` input), and a scalar `loss` value, which is the averaged cross-entropy loss between the softmax probabilities and the ground truth values. Use parameter `label_prob`=1 to enable inputting labels as a probability distribution.
The operator computes the softmax normalized values for each layer in the batch
of the given input, after which cross-entropy loss is computed. This operator is Softmax cross-entropy loss function:
numerically more stable than separate Softmax and CrossEntropy ops.
The inputs are a 2-D tensor (Tensor<float>) of size $$loss(x, class) = -\log{\biggl(\frac{\exp(x[class])}{\sum_{j} \exp(x[j])}\biggr)} = -x[class] + \log{\biggl(\sum_{j} \exp(x[j])\biggr)}$$
(batch_size x input_feature_dimensions) and tensor of labels (ground truth).
Output is tensor with the probability for each label for each example (N x D) or if the `weight_tensor` has been passed:
and averaged loss (scalar).
Use parameter label_prob=1 to enable inputting labels as a probability $$loss(x, class) = weight[class]\biggl(-x[class] + \log{\biggl(\sum_{j} \exp(x[j])\biggr)}\biggr)$$
distribution.
Optional third input blob can be used to weight the samples for the loss. The `logits` input does not need to explicitly be a 2D vector; rather, it will be coerced into one. For an arbitrary n-dimensional tensor `X` in $[a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}]$, where k is the `axis` provided, then `X` will be coerced into a 2-dimensional tensor with dimensions $[(a_0 * ... * a_{k-1}), (a_k * ... * a_{n-1})]$. For the default case where `axis`=1, the `X` tensor will be coerced into a 2D tensor of dimensions $[a_0, (a_1 * ... * a_{n-1})]$, where $a_0$ is often the batch size. In this situation, we must have $a_0 = N$ and $a_1 * ... * a_{n-1} = D$. Each of these dimensions must be matched correctly, or else the operator will throw errors.
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_with_loss_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"SoftmaxWithLoss",
["logits", "labels"],
["softmax", "avgloss"]
)
workspace.FeedBlob("logits", np.random.randn(1, 5).astype(np.float32))
workspace.FeedBlob("labels", np.asarray([4]).astype(np.int32))
print("logits:", workspace.FetchBlob("logits"))
print("labels:", workspace.FetchBlob("labels"))
workspace.RunOperatorOnce(op)
print("softmax:", workspace.FetchBlob("softmax"))
print("avgloss:", workspace.FetchBlob("avgloss"))
```
**Result**
```
logits: [[-0.3429451 -0.80375195 0.23104447 1.4569176 -0.5268362 ]]
labels: [4]
softmax: [[0.09721052 0.0613179 0.17258129 0.58800864 0.0808817 ]]
avgloss: 2.5147676
```
</details>
<details>
<summary> <b>Example 2</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"SoftmaxWithLoss",
["logits", "labels"],
["softmax", "avgloss"],
scale=5.0
)
workspace.FeedBlob("logits", np.asarray([[.1, .4, .7, 1.5, .2]]).astype(np.float32))
workspace.FeedBlob("labels", np.asarray([4]).astype(np.int32))
print("logits:", workspace.FetchBlob("logits"))
print("labels:", workspace.FetchBlob("labels"))
workspace.RunOperatorOnce(op)
print("softmax:", workspace.FetchBlob("softmax"))
print("avgloss:", workspace.FetchBlob("avgloss"))
```
**Result**
```
logits: [[0.1 0.4 0.7 1.5 0.2]]
labels: [4]
softmax: [[0.10715417 0.144643 0.19524762 0.4345316 0.11842369]]
avgloss: 10.667433
```
</details>
)DOC") )DOC")
.Input(0, "logits", "Unscaled log probabilities") .Arg("label_prob","*(type: int; default: 0)* Setting to 1 enables inputting labels as probability distribution.")
.Input(1, "labels", "Ground truth") .Arg("axis","*(type: int; default: 1)* Axis of the inputs when coerced to 2D.")
.Arg("scale","*(type: float)* Average loss output scaling factor (must be >= 0).")
.Arg("order","*(type: string; default: 'NCHW')* Order of blob dimensions (only 'NCHW' is supported currently).")
.Input(0, "logits", "*(type: Tensor`<float>`)* Input tensor.")
.Input(1, "labels", "*(type: Tensor`<float>`)* Ground truth label tensor.")
.Input( .Input(
2, 2,
"weight_tensor", "weight_tensor",
"Optional blob to be used to weight the samples for the loss.") "*(type: Tensor`<float>`)* [OPTIONAL] Blob used to weight the samples for the loss.")
.Output(0, "softmax", "Tensor with softmax cross entropy loss") .Output(0, "softmax", "*(type: Tensor`<float>`)* Softmax output tensor.")
.Output(1, "loss", "Average loss"); .Output(1, "loss", "*(type: float)* Averaged cross-entropy loss output.");
// Input: X, T, P, dY; Output: dX // Input: X, T, P, dY; Output: dX
OPERATOR_SCHEMA(SoftmaxWithLossGradient).NumOutputs(1); OPERATOR_SCHEMA(SoftmaxWithLossGradient).NumOutputs(1);

View File

@ -14,9 +14,55 @@ OPERATOR_SCHEMA(Sqr)
.NumOutputs(1) .NumOutputs(1)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc("Square (x^2) the elements of the input") .SetDoc(R"DOC(
.Input(0, "input", "Input tensor") Performs element-wise squaring ($x^2$) of input tensor.
.Output(0, "output", "Squared elements of the input");
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/sqr_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Sqr",
["X"],
["Y"],
)
workspace.FeedBlob("X", (np.random.randint(10, size=(3,3))).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[4. 6. 2.]
[0. 1. 6.]
[9. 2. 7.]]
Y:
[[16. 36. 4.]
[ 0. 1. 36.]
[81. 4. 49.]]
```
</details>
)DOC")
.Input(0, "X", "*(type: Tensor`<float>`)* Input data tensor.")
.Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.");
namespace { namespace {

View File

@ -19,10 +19,53 @@ OPERATOR_SCHEMA(Sqrt)
.AllowInplace({{0, 0}}) .AllowInplace({{0, 0}})
.IdenticalTypeAndShape() .IdenticalTypeAndShape()
.SetDoc(R"DOC( .SetDoc(R"DOC(
Computes the element-wise sqrt of the input. Performs element-wise square-root ($\sqrt{x}$) of input tensor $X$.
Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/sqrt_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Sqrt",
["X"],
["Y"],
)
workspace.FeedBlob("X", (np.random.randint(10, size=(3,3))).astype(np.float32))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("Y:", workspace.FetchBlob("Y"))
```
**Result**
```
X:
[[8. 3. 3.]
[4. 0. 0.]
[1. 2. 5.]]
Y:
[[2.8284268 1.7320508 1.7320508 ]
[1.9999999 0. 0. ]
[0.99999994 1.4142134 2.236068 ]]
```
</details>
)DOC") )DOC")
.Input(0, "X", "ND input tensor") .Input(0, "X", "*(type: Tensor`<float>`)* Input data tensor.")
.Output(0, "Y", "ND input tensor"); .Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.");
namespace { namespace {

View File

@ -49,16 +49,57 @@ OPERATOR_SCHEMA(Transpose)
return out; return out;
}) })
.SetDoc(R"DOC( .SetDoc(R"DOC(
Transpose the input tensor similar to numpy.transpose. For example, when Transpose the input tensor by permuting the axes of the input according
axes=(1, 0, 2), given an input tensor of shape (1, 2, 3), the output shape to the `axes` argument. Similar to numpy's
will be (2, 1, 3). [transpose](https://docs.scipy.org/doc/numpy/reference/generated/numpy.transpose.html)
function.
For example, when axes=(1, 0, 2), given an input tensor of shape
(1, 2, 3), the output shape will be (2, 1, 3).
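The same permutation in NumPy (a sketch for comparison only):

```
import numpy as np

x = np.random.rand(1, 2, 3)
y = np.transpose(x, axes=(1, 0, 2))
print(y.shape)  # (2, 1, 3)
```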
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/transpose_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Transpose",
["X"],
["Y"],
axes=(0,3,1,2)
)
x = np.random.rand(1,32,32,3)
workspace.FeedBlob("X", x)
print("X.shape (NHWC order):", workspace.FetchBlob("X").shape)
workspace.RunOperatorOnce(op)
print("Y.shape (NCHW order):", workspace.FetchBlob("Y").shape)
```
**Result**
```
X.shape (NHWC order): (1, 32, 32, 3)
Y.shape (NCHW order): (1, 3, 32, 32)
```
</details>
)DOC") )DOC")
.Arg( .Arg(
"axes", "axes",
"A list of integers. By default, reverse the dimensions, " "*(type: Tuple(int))* Order to permute axes of input tensor. Reverses "
"otherwise permute the axes according to the values given.") "the dimensions by default.")
.Input(0, "data", "An input tensor.") .Input(0, "X", "*(type: Tensor)* Input tensor.")
.Output(0, "transposed", "Transposed output.") .Output(0, "Y", "*(type: Tensor)* Transposed output.")
.InheritOnnxSchema("Transpose"); .InheritOnnxSchema("Transpose");
class GetTransposeGradient : public GradientMakerBase { class GetTransposeGradient : public GradientMakerBase {

View File

@ -1202,15 +1202,71 @@ OPERATOR_SCHEMA(NanCheck)
OPERATOR_SCHEMA(Size) OPERATOR_SCHEMA(Size)
.NumInputs(1) .NumInputs(1)
.NumOutputs(1) .NumOutputs(1)
.SetDoc( .SetDoc(R"DOC(
"Return a 1D tensor of type int64 that contains the number " Return a 1D tensor of type *int64* that contains the number of elements of the input tensor.
"of elements of the input tensor")
.Input(0, "tensor", "Tensor to calculate number of elements") Github Link:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/utility_ops.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
workspace.ResetWorkspace()
op = core.CreateOperator(
"Size",
["X"],
["size"],
)
workspace.FeedBlob("X", (np.random.randint(10, size=(3,3))))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("size:", workspace.FetchBlob("size"))
workspace.ResetWorkspace()
workspace.FeedBlob("X", (np.random.rand(6,4)))
print("X:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("size:", workspace.FetchBlob("size"))
```
**Result**
```
X:
[[3 7 0]
[0 1 6]
[5 0 8]]
size: 9
X:
[[0.92017884 0.32115368 0.68692035 0.64135016]
[0.8723328 0.77830265 0.80688656 0.25524236]
[0.37970216 0.76407047 0.85689564 0.30692883]
[0.69352573 0.42531502 0.16415212 0.59209324]
[0.52684188 0.37094846 0.60670079 0.6489272 ]
[0.94715906 0.34800557 0.61898769 0.28947359]]
size: 24
```
</details>
)DOC")
.Input(0, "X", "*(type: Tensor)* Input tensor to calculate number of elements.")
.Output( .Output(
0, 0,
"output", "size",
"1D tensor of type int64 that contains the number of " "*(type: Tensor)* 1D tensor of type int64 that contains the number of "
"elements in the input tensor."); "elements in the input tensor *X*.");
REGISTER_CPU_OPERATOR(Size, SizeOp<CPUContext>); REGISTER_CPU_OPERATOR(Size, SizeOp<CPUContext>);
NO_GRADIENT(Size); NO_GRADIENT(Size);