| import copy |
| import threading |
| import time |
| import unittest |
| from functools import partial, reduce |
| |
| import caffe2.python.hypothesis_test_util as hu |
| import hypothesis.strategies as st |
| import numpy as np |
| from caffe2.proto import caffe2_pb2 |
| |
| from caffe2.python import core, dyndep, tt_core, workspace |
| from hypothesis import assume, given, HealthCheck, settings |
| |
| dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/optimizers:sgd_simd_ops') |
| |
| if workspace.has_gpu_support: |
| # NOTE: During GPU stress tests, the number of workers exceeds the number |
| # of GPUs, which results in flakiness from GPU contention. As a |
| # result, deadlines are not enforced on CUDA runs. |
| _hypothesis_settings = settings |
| |
| def settings(**kwargs): |
| if 'deadline' in kwargs: |
| kwargs['deadline'] = None |
| kwargs.setdefault('max_examples', 50) |
| |
| def wrapped(f): |
| return _hypothesis_settings(**kwargs)(f) |
| return wrapped |
| |
| |
| def sigmoid(x): |
| return 1.0 / (1.0 + np.exp(-x)) |
| |
| |
| @st.composite |
| def _tensor_and_prefix(draw, dtype, elements, min_dim=1, max_dim=4, **kwargs): |
| dims_ = draw( |
| st.lists(hu.dims(**kwargs), min_size=min_dim, max_size=max_dim)) |
| extra_ = draw( |
| st.lists(hu.dims(**kwargs), min_size=min_dim, max_size=max_dim)) |
| assume(len(dims_) + len(extra_) < max_dim) |
| return (draw(hu.arrays(dims_ + extra_, dtype, elements)), |
| draw(hu.arrays(extra_, dtype, elements))) |
| |
| |
| def _tensor_and_indices(min_dim=1, max_dim=4, dtype=np.float32, |
| elements=None, **kwargs): |
| """ generates a tensor and a list of indices of larger tensor of same dim""" |
| data_dims_ = st.lists(hu.dims(**kwargs), min_size=min_dim, max_size=max_dim) |
| original_dim = st.integers(min_value=2, max_value=10) |
| return st.tuples(data_dims_, original_dim).flatmap(lambda pair: st.tuples( |
| st.just(pair[1]), # original dimension |
| hu.arrays(pair[0], dtype, elements), # data tensor |
| hu.arrays(pair[0][0], dtype=np.int64, elements=st.integers( |
| min_value=0, max_value=pair[1] - 1)), |
| )) |
| |
| |
| _NUMPY_TYPE_TO_ENUM = { |
| np.float32: core.DataType.FLOAT, |
| np.int32: core.DataType.INT32, |
| bool: core.DataType.BOOL, |
| np.uint8: core.DataType.UINT8, |
| np.int8: core.DataType.INT8, |
| np.uint16: core.DataType.UINT16, |
| np.int16: core.DataType.INT16, |
| np.int64: core.DataType.INT64, |
| np.float64: core.DataType.DOUBLE, |
| } |
| |
| |
| def _dtypes(dtypes=None): |
| dtypes = dtypes if dtypes else [np.int32, np.int64, np.float32] |
| return st.sampled_from(dtypes) |
| |
| |
| def _test_binary(name, ref, filter_=None, gcs=hu.gcs, |
| test_gradient=False, allow_inplace=False, dtypes=_dtypes): |
| @given( |
| inputs=dtypes().flatmap( |
| lambda dtype: hu.tensors( |
| n=2, dtype=dtype, |
| elements=hu.elements_of_type(dtype, filter_=filter_))), |
| out=st.sampled_from(('Y', 'X1', 'X2') if allow_inplace else ('Y',)), |
| **gcs) |
| @settings( |
| max_examples=20, |
| deadline=None, |
| suppress_health_check=[HealthCheck.filter_too_much]) |
| def test_binary(self, inputs, out, gc, dc): |
| op = core.CreateOperator(name, ["X1", "X2"], [out]) |
| X1, X2 = inputs |
| self.assertDeviceChecks(dc, op, [X1, X2], [0]) |
| # We only do gradient check with float32 types. |
| if test_gradient and X1.dtype == np.float32: |
| self.assertGradientChecks(gc, op, [X1, X2], 0, [0]) |
| self.assertReferenceChecks(gc, op, [X1, X2], ref) |
| |
| return test_binary |
| |
| |
| def _test_binary_broadcast(name, ref, filter_=None, |
| gcs=hu.gcs, allow_inplace=False, dtypes=_dtypes): |
| @given( |
| inputs=dtypes().flatmap(lambda dtype: _tensor_and_prefix( |
| dtype=dtype, |
| elements=hu.elements_of_type(dtype, filter_=filter_))), |
| in_place=(st.booleans() if allow_inplace else st.just(False)), |
| **gcs) |
| @settings( |
| max_examples=3, |
| deadline=100, |
| suppress_health_check=[HealthCheck.filter_too_much]) |
| def test_binary_broadcast(self, inputs, in_place, gc, dc): |
| op = core.CreateOperator( |
| name, ["X1", "X2"], ["X1" if in_place else "Y"], broadcast=1) |
| X1, X2 = inputs |
| self.assertDeviceChecks(dc, op, [X1, X2], [0]) |
| |
| def cast_ref(x, y): |
| return (np.array(ref(x, y)[0], dtype=x.dtype), ) |
| |
| # gradient not implemented yet |
| # self.assertGradientChecks(gc, op, [X1, X2], 0, [0]) |
| self.assertReferenceChecks(gc, op, [X1, X2], cast_ref) |
| |
| return test_binary_broadcast |
| |
| |
| class TestOperators(hu.HypothesisTestCase): |
| |
| def test_comparison_ops(self): |
| ops = {"LT": lambda x1, x2: [x1 < x2], |
| "LE": lambda x1, x2: [x1 <= x2], |
| "GT": lambda x1, x2: [x1 > x2], |
| "GE": lambda x1, x2: [x1 >= x2]} |
| for name, ref in ops.items(): |
| _test_binary(name, ref, gcs=hu.gcs_cpu_only)(self) |
| _test_binary_broadcast(name, ref, gcs=hu.gcs_cpu_only)(self) |
| |
| @given(inputs=hu.tensors(n=2), in_place=st.booleans(), **hu.gcs) |
| @settings(deadline=10000) |
| def test_sum(self, inputs, in_place, gc, dc): |
| op = core.CreateOperator("Sum", ["X1", "X2"], |
| ["Y" if not in_place else "X1"]) |
| X1, X2 = inputs |
| self.assertDeviceChecks(dc, op, [X1, X2], [0]) |
| self.assertGradientChecks(gc, op, [X1, X2], 0, [0]) |
| |
| @given(inputs=hu.tensors(n=2, min_dim=2, max_dim=2), **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_row_mul(self, inputs, gc, dc): |
| op = core.CreateOperator("RowMul", ["X1", "X2"], ["Y"]) |
| X1, Xtmp = inputs |
| X2 = Xtmp[:, 0] |
| |
| def ref(x, y): |
| ret = np.zeros(shape=x.shape, dtype=x.dtype) |
| for i in range(y.size): |
| ret[i, ] = x[i, ] * y[i] |
| return [ret] |
| |
| self.assertDeviceChecks(dc, op, [X1, X2], [0]) |
| for i in range(2): |
| self.assertGradientChecks(gc, op, [X1, X2], i, [0]) |
| self.assertReferenceChecks(gc, op, [X1, X2], ref) |
| |
| @given(inputs=hu.tensors(n=2), **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_max(self, inputs, gc, dc): |
| op = core.CreateOperator("Max", ["X1", "X2"], ["Y"]) |
| |
| X1, X2 = inputs |
| # Make X1 and X2 far from each other, since X1=X2 is not differentiable |
| # and the step size of gradient checker is 0.05 |
| X1[np.logical_and(X1 >= X2 - 0.05, X1 <= X2)] -= 0.05 |
| X1[np.logical_and(X1 <= X2 + 0.05, X1 >= X2)] += 0.05 |
| self.assertDeviceChecks(dc, op, [X1, X2], [0]) |
| for i in range(2): |
| self.assertGradientChecks(gc, op, [X1, X2], i, [0]) |
| |
| def elementwise_max(X, Y): |
| return [np.maximum(X, Y)] |
| self.assertReferenceChecks(gc, op, [X1, X2], elementwise_max) |
| |
| def test_add(self): |
| def not_overflow(x): |
| if not isinstance(x, float): |
| return abs(x) < (1 << 30) - 1 |
| return True |
| |
| def ref(x, y): |
| return (x + y, ) |
| _test_binary("Add", ref, filter_=not_overflow, test_gradient=True)(self) |
| _test_binary_broadcast("Add", ref, filter_=not_overflow)(self) |
| |
| def test_sub(self): |
| def ref(x, y): |
| return (x - y, ) |
| # TODO(jiayq): enable the gradient test for the broadcast case once it is implemented. |
| _test_binary("Sub", ref, test_gradient=True)(self) |
| _test_binary_broadcast("Sub", ref)(self) |
| |
| def test_mul(self): |
| def not_overflow(x): |
| if not isinstance(x, float): |
| return abs(x) < (1 << 15) - 1 |
| return True |
| |
| def ref(x, y): |
| return (x * y, ) |
| _test_binary("Mul", ref, filter_=not_overflow, test_gradient=True)(self) |
| _test_binary_broadcast("Mul", ref, filter_=not_overflow)(self) |
| |
| @settings(suppress_health_check=[HealthCheck.too_slow]) |
| def test_div(self): |
| def ref(x, y): |
| return (x / y, ) |
| |
| def non_zero(x): |
| return abs(x) > 1e-2 |
| |
| def div_dtypes(): |
| return st.sampled_from([np.float32, np.float64]) |
| |
| _test_binary( |
| "Div", ref, filter_=non_zero, test_gradient=True, |
| dtypes=div_dtypes, gcs=hu.gcs_cpu_only |
| )(self) |
| _test_binary( |
| "Div", ref, filter_=non_zero, test_gradient=False, |
| dtypes=div_dtypes |
| )(self) |
| _test_binary_broadcast( |
| "Div", ref, filter_=non_zero, dtypes=div_dtypes)(self) |
| |
| @given(X=hu.tensor(), in_place=st.booleans(), **hu.gcs) |
| @settings(deadline=1000) |
| def test_negative(self, X, in_place, gc, dc): |
| op = core.CreateOperator("Negative", ["X"], |
| ["Y" if not in_place else "X"]) |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| |
| @given(X=hu.tensor(), **hu.gcs) |
| @settings(deadline=1000) |
| def test_tanh(self, X, gc, dc): |
| op = core.CreateOperator("Tanh", "X", "Y") |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| |
| @given(X=hu.tensor(), **hu.gcs) |
| @settings(deadline=10000) |
| def test_averaged_loss(self, X, gc, dc): |
| op = core.CreateOperator("AveragedLoss", ["X"], ["loss"]) |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| |
| @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs) |
| @settings(deadline=10000) |
| def test_softsign(self, X, inplace, gc, dc): |
| op = core.CreateOperator("Softsign", ["X"], ["X" if inplace else "Y"]) |
| |
| def softsign(X): |
| return (X / (1 + np.abs(X)),) |
| |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| self.assertReferenceChecks(gc, op, [X], softsign) |
| if inplace: |
| with self.assertRaises(Exception): |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| else: |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| |
| @given( |
| device_options=st.lists( |
| min_size=2, |
| max_size=4, |
| elements=st.sampled_from(hu.expanded_device_options)), |
| set_seed=st.booleans()) |
| @settings(deadline=10000) |
| def test_random_seed_behaviour(self, device_options, set_seed): |
| # Require all sampled device options to share a single device type (all |
| # CPU or all CUDA), since the RNG is inconsistent between CPU and GPU. |
| device_options = copy.deepcopy(device_options) |
| assume(len({do.device_type for do in device_options}) == 1) |
| if set_seed: |
| for do in device_options: |
| do.random_seed = 1000 |
| |
| def run(do): |
| # Reset each time because 'Y' may already exist in the workspace |
| # on a different device |
| workspace.ResetWorkspace() |
| ws = workspace.C.Workspace() |
| op = core.CreateOperator( |
| "XavierFill", [], ["Y"], |
| device_option=do, |
| shape=[2]) |
| ws.run(op) |
| return ws.blobs["Y"].fetch() |
| |
| ys = [run(do) for do in device_options] |
| for y in ys[1:]: |
| if set_seed: |
| np.testing.assert_array_equal(ys[0], y) |
| else: |
| with self.assertRaises(AssertionError): |
| np.testing.assert_array_equal(ys[0], y) |
| |
| @given(axis=st.integers(min_value=1, max_value=4), |
| num_output=st.integers(min_value=4, max_value=8), |
| engine=st.sampled_from(["", "PACKED"]), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_fully_connected_axis(self, axis, num_output, engine, gc, dc): |
| np.random.seed(1) |
| X = np.random.randn(1, 2, 3, 2, 1).astype(np.float32) |
| |
| def prod(xs): |
| p = 1 |
| for x in xs: |
| p *= x |
| return p |
| |
| K = prod(list(X.shape)[axis:]) |
| N = num_output |
| W = np.random.randn(N, K).astype(np.float32) |
| b = np.random.randn(N).astype(np.float32) |
| |
| op = core.CreateOperator( |
| "FC", |
| ["X", "W", "b"], |
| ["Y"], |
| engine=engine, |
| axis=axis) |
| for name, param in [("X", X), ("W", W), ("b", b)]: |
| self.ws.create_blob(name).feed(param) |
| self.ws.run(op) |
| Y = self.ws.blobs["Y"].fetch() |
| self.assertEqual(list(Y.shape), list(X.shape)[:axis] + [N]) |
| |
| inputs = [X, W, b] |
| self.assertDeviceChecks(dc, op, inputs, [0]) |
| for param, _ in enumerate(inputs): |
| self.assertGradientChecks(gc, op, inputs, param, [0]) |
| |
| @unittest.skipIf(not workspace.has_gpu_support, |
| "Skipping test due to no gpu present.") |
| @settings(deadline=None) |
| @given(hidden_size=st.integers(min_value=1, max_value=3), |
| num_layers=st.integers(min_value=1, max_value=3), |
| bidirectional=st.booleans(), |
| rnn_mode=st.sampled_from(["lstm"]), # TODO: "gru" |
| input_mode=st.sampled_from(["linear"]), |
| dropout=hu.floats(min_value=1.0, max_value=1.0), |
| T=st.integers(min_value=2, max_value=6), |
| N=st.integers(min_value=1, max_value=4), |
| D=st.integers(min_value=1, max_value=4)) |
| def test_recurrent(self, hidden_size, num_layers, bidirectional, rnn_mode, |
| input_mode, dropout, T, N, D): |
| # There is a bug in MIOpen for N=1 that should be resolved in the next release. |
| if workspace.has_hip_support: |
| assume(N > 1) |
| # Fixed random seed; this one happens to pass. |
| seed = 1234 |
| np.random.seed(seed) |
| # set device option |
| if workspace.has_hip_support: |
| device_option = hu.hip_do |
| engine = 'MIOPEN' |
| else: |
| device_option = hu.gpu_do |
| engine = 'CUDNN' |
| input_weight_size = hidden_size * D |
| upper_layer_input_weight_size = hidden_size * hidden_size |
| if bidirectional: |
| upper_layer_input_weight_size *= 2 |
| recurrent_weight_size = hidden_size * hidden_size |
| input_bias_size = hidden_size |
| recurrent_bias_size = hidden_size |
| num_directions = 2 if bidirectional else 1 |
| first_layer_sz = input_weight_size + recurrent_weight_size + \ |
| input_bias_size + recurrent_bias_size |
| upper_layer_sz = upper_layer_input_weight_size + \ |
| recurrent_weight_size + input_bias_size + \ |
| recurrent_bias_size |
| total_sz = 4 * (first_layer_sz + (num_layers - 1) * upper_layer_sz) |
| total_sz *= num_directions |
| |
| W = np.random.rand(total_sz).astype(np.float32) |
| self.ws.create_blob("WEIGHT").feed(W, device_option=device_option) |
| |
| op = core.CreateOperator( |
| "Recurrent", |
| ["INPUT", "HIDDEN_INPUT", "CELL_INPUT", "WEIGHT"], |
| ["OUTPUT", "HIDDEN_OUTPUT", "CELL_OUTPUT", |
| "RNN_SCRATCH", "DROPOUT_STATES"], |
| hidden_size=hidden_size, |
| bidirectional=bidirectional, |
| rnn_mode=rnn_mode, |
| dropout=dropout, |
| input_mode=input_mode, |
| num_layers=num_layers, |
| seed=seed, |
| engine=engine) |
| X = np.random.randn(T, N, D).astype(np.float32) |
| self.ws.create_blob("INPUT").feed(X, device_option=device_option) |
| W = self.ws.blobs["WEIGHT"].fetch() |
| H = np.random.randn( |
| num_layers, N, hidden_size * num_directions).astype( |
| np.float32) |
| C = np.random.randn( |
| num_layers, N, hidden_size * num_directions).astype( |
| np.float32) if rnn_mode == "lstm" else \ |
| np.empty((1,)).astype(np.float32) # unused in GRU |
| inputs = [X, H, C, W] |
| input_idxs = [i for (i, _) in enumerate(inputs)] \ |
| if rnn_mode == "lstm" else [0, 1, 3] # ignore C |
| for input_idx in input_idxs: |
| self.assertGradientChecks( |
| device_option, op, inputs, input_idx, [0], |
| stepsize=0.01, threshold=0.01) |
| |
| @given(ndim=st.integers(1, 4), |
| axis=st.integers(0, 3), |
| add_axis=st.integers(0, 1), |
| num_inputs=st.integers(2, 4), **hu.gcs) |
| @settings(deadline=None, max_examples=50) |
| def test_depth_concat(self, ndim, axis, add_axis, num_inputs, gc, dc): |
| assume(axis < ndim) |
| input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs] |
| shape = [2, 3, 5, 7][:ndim] |
| individual_dims = [1, 2, 3, 4, 5][:num_inputs] |
| inputs = [] |
| for i in range(num_inputs): |
| if add_axis == 0: |
| # Set a unique dim along the concat axis and create the input. |
| shape[axis] = individual_dims[i] |
| inputs.append(np.random.randn(*shape).astype(np.float32)) |
| op = core.CreateOperator("Concat", input_names, ["Y", "Y_dims"], |
| axis=axis, add_axis=add_axis) |
| self.assertDeviceChecks(dc, op, inputs, [0]) |
| for i in range(num_inputs): |
| self.assertGradientChecks(gc, op, inputs, i, [0]) |
| |
| # Reference |
| def depth_concat(*inputs): |
| inputs = list(inputs) |
| if add_axis: |
| for i in range(len(inputs)): |
| inputs[i] = np.expand_dims(inputs[i], axis) |
| input_dims = np.array([np.shape(x)[axis] for x in inputs]) |
| return [np.concatenate(inputs, axis=axis), input_dims] |
| |
| self.assertReferenceChecks(gc, op, inputs, depth_concat) |
| |
| @given(num_inputs=st.integers(2, 4), |
| order=st.sampled_from([("NCHW", 1), ("NHWC", 3)]), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_depth_concat_with_order(self, num_inputs, order, gc, dc): |
| input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs] |
| shape = [2, 3, 5, 7] |
| individual_dims = [1, 2, 3, 4][:num_inputs] |
| inputs = [] |
| for i in range(num_inputs): |
| # Set a unique dim along the concat axis and create the input. |
| shape[order[1]] = individual_dims[i] |
| inputs.append(np.random.rand(*shape).astype(np.float32)) |
| op = core.CreateOperator("Concat", input_names, ["Y", "Y_dims"], |
| order=order[0]) |
| self.assertDeviceChecks(dc, op, inputs, [0]) |
| for i in range(num_inputs): |
| self.assertGradientChecks(gc, op, inputs, i, [0]) |
| |
| # Reference |
| def depth_concat_with_order(*inputs): |
| inputs = list(inputs) |
| axis = order[1] |
| input_dims = np.array([np.shape(x)[axis] for x in inputs]) |
| return [np.concatenate(inputs, axis=axis), input_dims] |
| |
| self.assertReferenceChecks(gc, op, inputs, depth_concat_with_order) |
| |
| @given(X=hu.arrays(dims=[5, 2], |
| elements=hu.floats( |
| min_value=1.0, |
| max_value=10.0) |
| ), |
| **hu.gcs_cpu_only) |
| @settings(deadline=1000) |
| def test_last_n_windows(self, X, gc, dc): |
| workspace.FeedBlob('input', X) |
| workspace.FeedBlob('next', np.array(0, dtype=np.int32)) |
| workspace.CreateBlob('output') |
| collect_net = core.Net('collect_net') |
| collect_net.LastNWindowCollector( |
| ['output', 'next', 'input'], |
| ['output', 'next'], |
| num_to_collect=7, |
| ) |
| plan = core.Plan('collect_data') |
| plan.AddStep(core.execution_step('collect_data', |
| [collect_net], num_iter=2)) |
| workspace.RunPlan(plan) |
| output = workspace.FetchBlob('output') |
| inputs = workspace.FetchBlob('input') |
| new_output = np.zeros([7, inputs.shape[1]]) |
| for i in range(inputs.shape[0] * 2): |
| new_output[i % 7] = inputs[i % inputs.shape[0]] |
| import numpy.testing as npt |
| npt.assert_almost_equal(output, new_output, decimal=5) |
| |
| @given(dtype=st.sampled_from([np.float32, np.float64, np.int32, bool])) |
| @settings(deadline=1000) |
| def test_print(self, dtype): |
| data = np.random.permutation(6).astype(dtype) |
| self.ws.create_blob("data").feed(data) |
| op = core.CreateOperator("Print", "data", []) |
| self.ws.run(op) |
| |
| @given(inputs=hu.tensors(n=2), |
| in_place=st.booleans(), |
| momentum=hu.floats(min_value=0.1, max_value=0.9), |
| nesterov=st.booleans(), |
| lr=hu.floats(min_value=0.1, max_value=0.9), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_momentum_sgd( |
| self, inputs, in_place, momentum, nesterov, lr, gc, dc): |
| grad, m = inputs |
| lr = np.asarray([lr], dtype=np.float32) |
| op = core.CreateOperator( |
| "MomentumSGD", |
| ["grad", "m", "lr"], |
| ["grad" if in_place else "grad_o", |
| "m" if in_place else "m_o"], |
| momentum=momentum, |
| nesterov=int(nesterov), |
| device_option=gc) |
| self.assertDeviceChecks( |
| dc, op, [grad, m, lr], [0]) |
| |
| # Reference |
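| # The reference below mirrors the operator: plain momentum returns |
| # m_new = lr * grad + momentum * m as both the adjusted gradient and the new |
| # momentum buffer, while Nesterov uses m_new = momentum * m + lr * grad and |
| # returns (1 + momentum) * m_new - momentum * m as the adjusted gradient. |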
| def momentum_sgd(grad, m, lr): |
| lr = lr[0] |
| if not nesterov: |
| adjusted_gradient = lr * grad + momentum * m |
| return (adjusted_gradient, adjusted_gradient) |
| else: |
| m_new = momentum * m + lr * grad |
| return ((1 + momentum) * m_new - momentum * m, m_new) |
| |
| self.assertReferenceChecks(gc, op, [grad, m, lr], momentum_sgd) |
| |
| @given(inputs=hu.tensors(n=3), |
| in_place=st.booleans(), |
| decay=hu.floats(min_value=0.1, max_value=0.9), |
| momentum=hu.floats(min_value=0.1, max_value=0.9), |
| lr=hu.floats(min_value=0.1, max_value=0.9), |
| epsilon=hu.floats(min_value=1e-5, max_value=1e-2), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_rmsprop_sgd(self, inputs, in_place, decay, momentum, lr, epsilon, |
| gc, dc): |
| grad, ms, mom = inputs |
| ms = np.abs(ms) + 0.01 |
| lr = np.asarray([lr], dtype=np.float32) |
| op = core.CreateOperator( |
| "RmsProp", |
| ["grad", "ms", "mom", "lr"], |
| ["grad" if in_place else "grad_o", |
| "ms" if in_place else "ms_o", |
| "mom" if in_place else "mom_o"], |
| momentum=momentum, decay=decay, epsilon=epsilon, device_option=gc) |
| self.assertDeviceChecks(dc, op, [grad, ms, mom, lr], [0]) |
| |
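| # Reference: the RmsProp update below keeps a running mean of squared |
| # gradients, ms_o = ms + (1 - decay) * (grad^2 - ms), applies a momentum step |
| # mom_o = momentum * mom + lr * grad / sqrt(epsilon + ms_o), and outputs |
| # mom_o as the adjusted gradient. |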
| def rmsprop(grad, ms, mom, lr): |
| lr = lr[0] |
| ms_o = ms + (1. - decay) * (np.square(grad) - ms) |
| mom_o = momentum * mom + lr * grad / np.sqrt(epsilon + ms_o) |
| grad_o = mom_o |
| return (grad_o, ms_o, mom_o) |
| self.assertReferenceChecks(gc, op, [grad, ms, mom, lr], rmsprop) |
| |
| # Reference |
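| # _dense_ftrl implements the FTRL-Proximal update: with g2 = g * g and |
| # sigma = (sqrt(n + g2) - sqrt(n)) / alpha, it accumulates z += g - sigma * w |
| # and n += g2, then sets w = (sign(z) * lambda1 - z) / ((beta + sqrt(n)) / alpha + lambda2), |
| # zeroed wherever abs(z) <= lambda1. 'nz' packs n and z along the last axis. |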
| @staticmethod |
| def _dense_ftrl(alpha, beta, lambda1, lambda2, w, nz, g): |
| if isinstance(alpha, np.ndarray): |
| alpha = alpha.item() |
| n = np.take(nz, 0, axis=-1) |
| z = np.take(nz, 1, axis=-1) |
| # python port of Sigrid's implementation |
| g2 = g * g |
| sigma = (np.sqrt(n + g2) - np.sqrt(n)) / alpha |
| z += g - sigma * w |
| n += g2 |
| w = (np.sign(z) * lambda1 - z) / ( |
| (beta + np.sqrt(n)) / alpha + lambda2) |
| w[np.abs(z) <= lambda1] = 0 |
| return (w, np.stack([n, z], axis=-1)) |
| |
| @given(inputs=hu.tensors(n=4), |
| in_place=st.booleans(), |
| alpha=hu.floats(min_value=0.01, max_value=0.1), |
| beta=hu.floats(min_value=0.1, max_value=0.9), |
| lambda1=hu.floats(min_value=0.001, max_value=0.1), |
| lambda2=hu.floats(min_value=0.001, max_value=0.1), |
| engine=st.sampled_from([None, "SIMD"]), |
| **hu.gcs_cpu_only) |
| @settings(deadline=1000) |
| def test_ftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, |
| engine, gc, dc): |
| var, n, z, grad = inputs |
| n = np.abs(n) |
| nz = np.stack([n, z], axis=-1) |
| op = core.CreateOperator( |
| "Ftrl", |
| ["var", "nz", "grad"], |
| ["var" if in_place else "var_o", |
| "nz" if in_place else "nz_o"], |
| alpha=alpha, beta=beta, lambda1=lambda1, lambda2=lambda2, |
| engine=engine, |
| device_option=gc) |
| self.assertDeviceChecks( |
| dc, op, [var, nz, grad], [0]) |
| |
| self.assertReferenceChecks( |
| gc, op, [var, nz, grad], |
| partial(self._dense_ftrl, alpha, beta, lambda1, lambda2)) |
| |
| # Reference |
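| # _dense_gftrl is the group-FTRL variant: n and z accumulate exactly as in |
| # _dense_ftrl, but w and z are reshaped to (output_dim, input_dim) and the |
| # threshold acts per column (group): w[:, i] becomes z[:, i] scaled by |
| # (lambda1 * sqrt(output_dim) / norm(z[:, i]) - 1) / ((beta + sqrt(n)) / alpha + lambda2), |
| # and is zeroed when norm(z[:, i]) <= lambda1 * sqrt(output_dim). |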
| @staticmethod |
| def _dense_gftrl(alpha, beta, lambda1, lambda2, w, nz, g): |
| if isinstance(alpha, np.ndarray): |
| alpha = alpha.item() |
| |
| old_shape = g.shape |
| |
| n = np.take(nz, 0, axis=-1) |
| z = np.take(nz, 1, axis=-1) |
| |
| output_dim = g.shape[0] |
| |
| w = w.reshape(output_dim, -1) |
| g = g.reshape(output_dim, -1) |
| |
| n = n.reshape(output_dim, -1) |
| z = z.reshape(output_dim, -1) |
| |
| input_dim = g.shape[1] |
| |
| g2 = g * g |
| sigma = (np.sqrt(n + g2) - np.sqrt(n)) / alpha |
| z += g - sigma * w |
| n += g2 |
| |
| z_norms = np.linalg.norm(z, 2, axis=0) |
| |
| z_norms = z_norms + 1e-6 |
| w = z * ((lambda1 * np.sqrt(output_dim)) / z_norms - 1) / \ |
| ((beta + np.sqrt(n)) / alpha + lambda2) |
| for i in range(input_dim): |
| if z_norms[i] <= lambda1 * np.sqrt(output_dim): |
| w[:, i] = 0 |
| |
| w = w.reshape(old_shape) |
| n = n.reshape(old_shape) |
| z = z.reshape(old_shape) |
| return (w, np.stack([n, z], axis=-1)) |
| |
| @given(inputs=hu.tensors(n=4), |
| in_place=st.booleans(), |
| alpha=hu.floats(min_value=0.01, max_value=0.1), |
| beta=hu.floats(min_value=0.1, max_value=0.9), |
| lambda1=hu.floats(min_value=0.001, max_value=0.1), |
| lambda2=hu.floats(min_value=0.001, max_value=0.1), |
| engine=st.sampled_from([None, "SIMD"]), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_gftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, |
| engine, gc, dc): |
| var, n, z, grad = inputs |
| n = np.abs(n) |
| nz = np.stack([n, z], axis=-1) |
| op = core.CreateOperator( |
| "GFtrl", |
| ["var", "nz", "grad"], |
| ["var" if in_place else "var_o", |
| "nz" if in_place else "nz_o"], |
| alpha=alpha, beta=beta, lambda1=lambda1, lambda2=lambda2, |
| engine=engine, |
| device_option=gc) |
| self.assertDeviceChecks( |
| dc, op, [var, nz, grad], [0]) |
| |
| self.assertReferenceChecks( |
| gc, op, [var, nz, grad], |
| partial(self._dense_gftrl, alpha, beta, lambda1, lambda2)) |
| |
| @given(inputs=hu.tensors(n=4), |
| alpha=hu.floats(min_value=0.01, max_value=0.1), |
| beta=hu.floats(min_value=0.1, max_value=0.9), |
| lambda1=hu.floats(min_value=0.001, max_value=0.1), |
| lambda2=hu.floats(min_value=0.001, max_value=0.1), |
| engine=st.sampled_from([None, "SIMD"]), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_sparse_ftrl_sgd(self, inputs, alpha, beta, lambda1, lambda2, |
| engine, gc, dc): |
| var, n, z, grad = inputs |
| # Generate a fake index subset manually; a hypothesis strategy for this would be overly complicated. |
| indices = np.arange(var.shape[0]) |
| indices = indices[indices % 2 == 0] |
| grad = grad[indices] |
| n = np.abs(n) |
| nz = np.stack([n, z], axis=-1) |
| op = core.CreateOperator( |
| "SparseFtrl", |
| ["var", "nz", "indices", "grad"], |
| ["var", "nz"], |
| alpha=alpha, beta=beta, lambda1=lambda1, lambda2=lambda2, |
| engine=engine, |
| device_option=gc) |
| self.assertDeviceChecks( |
| dc, op, [var, nz, indices, grad], [0]) |
| |
| # Reference |
| def ftrl(w, nz, i, g): |
| sw, snz = self._dense_ftrl(alpha, beta, lambda1, lambda2, |
| w[i], nz[i], g) |
| w[i] = sw |
| nz[i] = snz |
| return (w, nz) |
| |
| self.assertReferenceChecks(gc, op, [var, nz, indices, grad], ftrl) |
| |
| # Reference |
| @staticmethod |
| def _dense_ftrl_send_alpha_by_input(beta, lambda1, lambda2, w, nz, g, alpha): |
| return TestOperators._dense_ftrl(alpha, beta, lambda1, lambda2, w, nz, |
| g) |
| |
| @given(inputs=hu.tensors(n=4), |
| in_place=st.booleans(), |
| alpha=hu.floats(min_value=0.01, max_value=0.1), |
| beta=hu.floats(min_value=0.1, max_value=0.9), |
| lambda1=hu.floats(min_value=0.001, max_value=0.1), |
| lambda2=hu.floats(min_value=0.001, max_value=0.1), |
| engine=st.sampled_from([None, "SIMD"]), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_ftrl_sgd_send_alpha_by_input(self, inputs, in_place, alpha, beta, |
| lambda1, lambda2, engine, gc, dc): |
| var, n, z, grad = inputs |
| n = np.abs(n) |
| nz = np.stack([n, z], axis=-1) |
| alpha = np.array(alpha).astype(np.float32) |
| op = core.CreateOperator( |
| "Ftrl", |
| ["var", "nz", "grad", "alpha"], |
| ["var" if in_place else "var_o", |
| "nz" if in_place else "nz_o"], |
| beta=beta, lambda1=lambda1, lambda2=lambda2, |
| engine=engine, |
| device_option=gc) |
| self.assertDeviceChecks( |
| dc, op, [var, nz, grad, alpha], [0]) |
| |
| self.assertReferenceChecks( |
| gc, op, [var, nz, grad, alpha], |
| partial(self._dense_ftrl_send_alpha_by_input, beta, lambda1, lambda2)) |
| |
| @given(inputs=hu.tensors(n=4), |
| alpha=hu.floats(min_value=0.01, max_value=0.1), |
| beta=hu.floats(min_value=0.1, max_value=0.9), |
| lambda1=hu.floats(min_value=0.001, max_value=0.1), |
| lambda2=hu.floats(min_value=0.001, max_value=0.1), |
| engine=st.sampled_from([None, "SIMD"]), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_sparse_ftrl_sgd_send_alpha_by_input(self, inputs, alpha, beta, |
| lambda1, lambda2, engine, gc, |
| dc): |
| var, n, z, grad = inputs |
| # Generate a fake index subset manually; a hypothesis strategy for this would be overly complicated. |
| indices = np.arange(var.shape[0]) |
| indices = indices[indices % 2 == 0] |
| grad = grad[indices] |
| n = np.abs(n) |
| nz = np.stack([n, z], axis=-1) |
| alpha = np.array(alpha).astype(np.float32) |
| op = core.CreateOperator( |
| "SparseFtrl", |
| ["var", "nz", "indices", "grad", "alpha"], |
| ["var", "nz"], |
| beta=beta, lambda1=lambda1, lambda2=lambda2, |
| engine=engine, |
| device_option=gc) |
| self.assertDeviceChecks( |
| dc, op, [var, nz, indices, grad, alpha], [0]) |
| |
| # Reference |
| def ftrl(w, nz, i, g, alpha): |
| sw, snz = self._dense_ftrl_send_alpha_by_input(beta, lambda1, |
| lambda2, w[i], nz[i], |
| g, alpha) |
| w[i] = sw |
| nz[i] = snz |
| return (w, nz) |
| |
| self.assertReferenceChecks(gc, op, [var, nz, indices, grad, alpha], |
| ftrl) |
| |
| @given(input=hu.tensor(max_value=20, |
| max_dim=1, |
| dtype=np.int32, |
| elements=st.integers(min_value=0, max_value=10)), |
| with_remapping=st.booleans(), |
| **hu.gcs_no_hip) |
| @settings(deadline=10000) |
| def test_unique(self, input, with_remapping, gc, dc): |
| op = core.CreateOperator( |
| "Unique", |
| ["input"], |
| ["unique"] + (["remapping"] if with_remapping else []), |
| device_option=gc) |
| self.assertDeviceChecks(dc, op, [input], [0]) |
| |
| # Validator |
| def unique_valid(input, unique, remapping=None): |
| self.assertEqual(unique.size, len(set(input))) |
| self.assertEqual(sorted(unique), sorted(set(input))) |
| if with_remapping: |
| self.assertEqual(remapping.shape, input.shape) |
| remapped = [unique[remapping[i]] for i in range(len(input))] |
| np.testing.assert_array_equal(remapped, input) |
| |
| self.assertValidationChecks(gc, op, [input], unique_valid) |
| |
| @given(prediction=hu.arrays(dims=[10, 3], |
| elements=hu.floats(allow_nan=False, |
| allow_infinity=False, |
| min_value=0, |
| max_value=1)), |
| labels=hu.arrays(dims=[10], |
| dtype=np.int32, |
| elements=st.integers(min_value=0, |
| max_value=3 - 1)), |
| top_k=st.integers(min_value=1, max_value=3), |
| **hu.gcs) |
| @settings(deadline=1000) |
| def test_accuracy(self, prediction, labels, top_k, gc, dc): |
| # top_k > 1 is only exercised on CPU, so force the CPU device option. |
| if top_k > 1: |
| gc = hu.cpu_do |
| |
| op = core.CreateOperator( |
| "Accuracy", |
| ["prediction", "labels"], |
| ["accuracy"], |
| top_k=top_k, |
| device_option=gc |
| ) |
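| # Reference: a sample counts as correct when its label is among the top_k |
| # highest-scoring predictions; accuracy is correct / N. |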
| |
| def op_ref(prediction, labels, top_k): |
| N = prediction.shape[0] |
| correct = 0 |
| for i in range(0, len(prediction)): |
| pred_sorted = sorted( |
| ([item, j] for j, item in enumerate(prediction[i])), |
| key=lambda x: x[0], |
| reverse=True |
| ) |
| max_ids = [x[1] for x in pred_sorted[0:top_k]] |
| for m in max_ids: |
| if m == labels[i]: |
| correct += 1 |
| accuracy = correct / N |
| return (accuracy,) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[prediction, labels, top_k], |
| reference=op_ref) |
| |
| @given(target_probabilities=hu.arrays( |
| dims=[10], elements=hu.floats(allow_nan=False, |
| allow_infinity=False, |
| min_value=0.01, |
| max_value=1)), |
| **hu.gcs) |
| @settings(deadline=1000) |
| def test_perplexity(self, target_probabilities, gc, dc): |
| op = core.CreateOperator( |
| "Perplexity", |
| ["target_probabilities"], |
| ["perplexity"] |
| ) |
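| # Reference: perplexity here is prod_i p_i^(-1/N), i.e. the geometric mean |
| # of the inverse target probabilities. |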
| |
| def op_ref(target_probabilities): |
| N = target_probabilities.shape[0] |
| perplexities = np.power(target_probabilities, -1.0 / N) |
| perplexity = reduce(lambda x, y: x * y, perplexities) |
| return (perplexity,) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[target_probabilities], |
| reference=op_ref) |
| |
| @given(lengths=st.lists(st.integers(min_value=0, max_value=10), |
| min_size=0, |
| max_size=10), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_lengths_to_segment_ids(self, lengths, gc, dc): |
| op = core.CreateOperator( |
| "LengthsToSegmentIds", |
| ["lengths"], |
| ["segment_ids"]) |
| |
| def op_ref(lengths): |
| sids = [] |
| for i, l in enumerate(lengths): |
| sids.extend(l * [i]) |
| return (np.array(sids, dtype=np.int32), ) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[np.array(lengths, dtype=np.int32)], |
| reference=op_ref) |
| |
| @given(lengths=st.lists(st.integers(min_value=0, max_value=10), |
| min_size=0, |
| max_size=10), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_lengths_range_fill(self, lengths, gc, dc): |
| op = core.CreateOperator( |
| "LengthsRangeFill", |
| ["lengths"], |
| ["increasing_seq"]) |
| |
| def op_ref(lengths): |
| sids = [] |
| for _, l in enumerate(lengths): |
| sids.extend(list(range(l))) |
| return (np.array(sids, dtype=np.int32), ) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[np.array(lengths, dtype=np.int32)], |
| reference=op_ref) |
| |
| @given(**hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_segment_ids_to_ranges(self, gc, dc): |
| lengths = [4, 6, 3, 2, 0, 4] |
| op = core.CreateOperator( |
| "SegmentIdsToRanges", |
| ["segment_ids"], |
| ["ranges"]) |
| |
| def op_ref(segment_ids): |
| ranges = [np.array([0, 0], dtype=np.int32)] |
| prev = 0 |
| for i, sid in enumerate(segment_ids): |
| while sid != prev: |
| prev += 1 |
| ranges.append(np.array([i, 0], dtype=np.int32)) |
| ranges[-1][1] += 1 |
| return (np.array(ranges, dtype=np.int32), ) |
| |
| def lengths_to_segment_ids(lengths): |
| sids = [] |
| for i, l in enumerate(lengths): |
| sids.extend(l * [i]) |
| return (np.array(sids, dtype=np.int32), ) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=np.array(lengths_to_segment_ids(lengths), dtype=np.int32), |
| reference=op_ref) |
| |
| @given(lengths=st.lists(st.integers(min_value=0, max_value=10), |
| min_size=0, |
| max_size=10), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_lengths_to_ranges(self, lengths, gc, dc): |
| op = core.CreateOperator( |
| "LengthsToRanges", |
| ["lengths"], |
| ["ranges"]) |
| |
| def op_ref(x): |
| if not x.size: |
| return (x.reshape((0, 2)), ) |
| return (np.column_stack((np.concatenate(([0], np.cumsum(x)[:-1])), |
| x)), ) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[np.array(lengths, dtype=np.int32)], |
| reference=op_ref) |
| |
| @given( |
| lengths=st.lists( |
| st.integers(min_value=0, max_value=10), min_size=0, max_size=10 |
| ), |
| include_last_offset=st.booleans(), |
| **hu.gcs_cpu_only |
| ) |
| @settings(deadline=None) |
| def test_lengths_to_offsets(self, lengths, include_last_offset, gc, dc): |
| op = core.CreateOperator( |
| "LengthsToOffsets", |
| ["lengths"], |
| ["ranges"], |
| include_last_offset=include_last_offset, |
| ) |
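| # e.g. lengths [2, 3] -> offsets [0, 2], or [0, 2, 5] when |
| # include_last_offset is set. |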
| |
| def op_ref(x): |
| if not x.size: |
| arr = [x.reshape(0)] |
| else: |
| arr = [np.concatenate(([0], np.cumsum(x)[:-1]))] |
| if include_last_offset: |
| arr[0] = np.concatenate((arr[0], np.array([np.sum(x)]))) |
| return tuple(arr) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[np.array(lengths, dtype=np.int32)], |
| reference=op_ref, |
| ) |
| |
| @given(prediction=hu.arrays(dims=[10, 3], |
| elements=hu.floats(allow_nan=False, |
| allow_infinity=False, |
| min_value=0, |
| max_value=1)), |
| labels=hu.arrays(dims=[10], |
| dtype=np.int32, |
| elements=st.integers(min_value=0, |
| max_value=3 - 1)), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_multi_class_accuracy(self, prediction, labels, gc, dc): |
| op = core.CreateOperator( |
| "MultiClassAccuracy", |
| ["prediction", "labels"], |
| ["accuracies", "amounts"] |
| ) |
| |
| def op_ref(prediction, labels): |
| N = prediction.shape[0] |
| D = prediction.shape[1] |
| accuracies = np.empty(D, dtype=float) |
| accuracies.fill(0) |
| amounts = np.empty(D, dtype=int) |
| amounts.fill(0) |
| max_ids = np.argmax(prediction, axis=1) |
| for i in range(0, N): |
| max_id = max_ids[i] |
| label_id = labels[i] |
| if max_id == label_id: |
| accuracies[label_id] += 1 |
| amounts[label_id] += 1 |
| for i in range(0, D): |
| amount = amounts[i] |
| if amount: |
| accuracies[i] /= amount |
| return (accuracies, amounts,) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[prediction, labels], |
| reference=op_ref) |
| |
| @given(lengths=st.lists(st.integers(min_value=0, max_value=10), |
| min_size=0, |
| max_size=10), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_segment_ids_to_lengths(self, lengths, gc, dc): |
| op = core.CreateOperator( |
| "SegmentIdsToLengths", |
| ["segment_ids"], |
| ["lengths"]) |
| |
| def lengths_to_ids(lengths): |
| sids = [] |
| for i, l in enumerate(lengths): |
| sids.extend(l * [i]) |
| return sids |
| |
| segment_ids = lengths_to_ids(lengths) |
| |
| def ids_to_lengths(ids): |
| ids_length = len(ids) |
| if ids_length == 0: |
| return (np.array([], dtype=np.int32),) |
| |
| lengths = [] |
| # segment id starts with 0 |
| prev_id = -1 |
| tmp_length = 0 |
| for idx in range(ids_length): |
| cur_id = ids[idx] |
| if cur_id != prev_id: |
| if idx != 0: |
| lengths.append(tmp_length) |
| while prev_id + 1 != cur_id: |
| lengths.append(0) |
| prev_id += 1 |
| prev_id = cur_id |
| tmp_length = 0 |
| tmp_length += 1 |
| lengths.append(tmp_length) |
| return (np.array(lengths, dtype=np.int32),) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[np.array(segment_ids, dtype=np.int32)], |
| reference=ids_to_lengths) |
| |
| @given(lengths=st.lists(st.integers(min_value=1, max_value=10), |
| min_size=0, |
| max_size=10), |
| power=st.sampled_from([0.5, 1.0, 1.5, 2.0]), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_lengths_to_weights(self, lengths, power, gc, dc): |
| op = core.CreateOperator( |
| "LengthsToWeights", |
| ["lengths"], |
| ["weights"], |
| power=power) |
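| # Each element of a segment of length l gets weight 1 / l^power. |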
| |
| def lengths_to_weights(lengths): |
| weighted_length = [] |
| for l in lengths: |
| weighted_length.extend(l * [1 / pow(l, power)]) |
| |
| return (np.array(weighted_length, dtype=float),) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[np.array(lengths, dtype=np.int32)], |
| reference=lengths_to_weights) |
| |
| @given(input_tensor=hu.arrays( |
| dims=[10], elements=hu.floats(allow_nan=False, |
| allow_infinity=False)), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_abs(self, input_tensor, gc, dc): |
| op = core.CreateOperator( |
| "Abs", |
| ["input"], |
| ["output"] |
| ) |
| |
| def abs_ref(input_tensor): |
| return (np.abs(input_tensor),) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[input_tensor], |
| reference=abs_ref) |
| |
| @given(input_tensor=hu.arrays( |
| dims=[10], elements=hu.floats(min_value=-10, |
| max_value=10)), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_cos(self, input_tensor, gc, dc): |
| op = core.CreateOperator( |
| "Cos", |
| ["input"], |
| ["output"] |
| ) |
| |
| def cos_ref(input_tensor): |
| return (np.cos(input_tensor),) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[input_tensor], |
| reference=cos_ref) |
| |
| @given(input_tensor=hu.arrays( |
| dims=[10], elements=hu.floats(min_value=-10, |
| max_value=10)), |
| **hu.gcs) |
| @settings(deadline=1000) |
| def test_sin(self, input_tensor, gc, dc): |
| op = core.CreateOperator( |
| "Sin", |
| ["input"], |
| ["output"] |
| ) |
| |
| def sin_ref(input_tensor): |
| return (np.sin(input_tensor),) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[input_tensor], |
| reference=sin_ref) |
| |
| @given(input_tensor=hu.arrays( |
| dims=[10], elements=hu.floats(allow_nan=False, |
| allow_infinity=False)), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_exp(self, input_tensor, gc, dc): |
| op = core.CreateOperator( |
| "Exp", |
| ["input"], |
| ["output"] |
| ) |
| |
| def exp_ref(input_tensor): |
| return (np.exp(input_tensor),) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[input_tensor], |
| reference=exp_ref) |
| |
| @given(input_tensor=hu.arrays( |
| dims=[10], elements=hu.floats(min_value=1, |
| max_value=10000)), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_log(self, input_tensor, gc, dc): |
| op = core.CreateOperator( |
| "Log", |
| ["input"], |
| ["output"] |
| ) |
| |
| def log_ref(input_tensor): |
| return (np.log(input_tensor),) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[input_tensor], |
| reference=log_ref) |
| self.assertGradientChecks(gc, op, [input_tensor], 0, [0]) |
| |
| def test_blobs_dequeue_timeout(self): |
| op = core.CreateOperator( |
| "CreateBlobsQueue", |
| [], |
| ["queue"], |
| capacity=5, |
| num_blobs=1) |
| self.ws.run(op) |
| t = time.time() |
| op = core.CreateOperator( |
| "DequeueBlobs", |
| ["queue"], |
| ["out"], |
| timeout_secs=0.2) |
| self.assertRaises(RuntimeError, lambda: self.ws.run(op)) |
| t = time.time() - t |
| self.assertGreater(t, 0.19) |
| |
| @given(num_threads=st.integers(1, 10), # noqa |
| num_elements=st.integers(1, 100), |
| capacity=st.integers(1, 5), |
| num_blobs=st.integers(1, 3), |
| do=st.sampled_from(hu.device_options)) |
| @settings(deadline=10000) |
| def test_blobs_queue_threading(self, num_threads, num_elements, |
| capacity, num_blobs, do): |
| """ |
| - Construct matrices of size N x D |
| - Start K threads |
| - Push all N rows into the queue of capacity C |
| - Pull all N rows out of the queue. |
| - Verify that the output matrices are a permutation of the rows of the |
| original matrices. |
| """ |
| import queue |
| op = core.CreateOperator( |
| "CreateBlobsQueue", |
| [], |
| ["queue"], |
| capacity=capacity, |
| num_blobs=num_blobs, |
| device_option=do) |
| self.ws.run(op) |
| |
| xs = [np.random.randn(num_elements, 5).astype(np.float32) |
| for _ in range(num_blobs)] |
| q = queue.Queue() |
| for i in range(num_elements): |
| q.put([x[i] for x in xs]) |
| |
| def enqueue(t): |
| while True: |
| feed_blobs = ["x_{}_{}".format(i, t) for i in range(num_blobs)] |
| op = core.CreateOperator( |
| "EnqueueBlobs", |
| ["queue"] + feed_blobs, |
| feed_blobs, |
| device_option=do) |
| try: |
| elems = q.get_nowait() |
| for elem, feed_blob in zip(elems, feed_blobs): |
| self.ws.create_blob(feed_blob).feed( |
| elem, device_option=do) |
| self.ws.run(op) |
| except queue.Empty: |
| return |
| |
| # Create all blobs before racing on multiple threads |
| # (blob creation is not threadsafe) |
| for t in range(num_threads): |
| for i in range(num_blobs): |
| self.ws.create_blob("x_{}_{}".format(i, t)) |
| |
| threads = [threading.Thread(target=enqueue, args=(t,)) |
| for t in range(num_threads)] |
| for thread in threads: |
| thread.start() |
| |
| for n in range(num_elements): |
| dequeue_blobs = ["y_{}_{}".format(i, n) for i in range(num_blobs)] |
| op = core.CreateOperator( |
| "DequeueBlobs", |
| ["queue"], |
| dequeue_blobs, |
| device_option=do) |
| self.ws.run(op) |
| for thread in threads: |
| thread.join() |
| op = core.CreateOperator("CloseBlobsQueue", ["queue"], []) |
| self.ws.run(op) |
| ys = [np.vstack([self.ws.blobs["y_{}_{}".format(i, n)].fetch() |
| for n in range(num_elements)]) |
| for i in range(num_blobs)] |
| for i in range(num_blobs): |
| self.assertEqual(ys[i].shape, xs[i].shape) |
| for j in range(num_elements): |
| # Verify that the rows of the returned blob are a |
| # permutation. The order may be different due to |
| # different threads racing. |
| self.assertTrue( |
| any(np.array_equal(xs[i][j], ys[i][k]) |
| for k in range(num_elements))) |
| |
| @given(num_producers=st.integers(1, 10), |
| num_consumers=st.integers(1, 10), |
| capacity=st.integers(1, 5), |
| num_blobs=st.integers(1, 3), |
| do=st.sampled_from(hu.device_options)) |
| @settings(deadline=None, max_examples=50) |
| def test_safe_blobs_queue(self, num_producers, num_consumers, |
| capacity, num_blobs, do): |
| init_net = core.Net('init_net') |
| queue = init_net.CreateBlobsQueue( |
| [], 1, capacity=capacity, num_blobs=num_blobs) |
| producer_steps = [] |
| truth = 0 |
| for i in range(num_producers): |
| name = 'producer_%d' % i |
| net = core.Net(name) |
| blobs = [net.ConstantFill([], 1, value=1.0, run_once=False) |
| for _ in range(num_blobs)] |
| status = net.NextName() |
| net.SafeEnqueueBlobs([queue] + blobs, blobs + [status]) |
| count = (i + 1) * 10 |
| step = core.execution_step(name, net, num_iter=count) |
| truth += count |
| producer_steps.append(step) |
| producer_exit_net = core.Net('producer_exit_net') |
| producer_exit_net.CloseBlobsQueue([queue], 0) |
| producer_step = core.execution_step('producer', [ |
| core.execution_step( |
| 'producers', producer_steps, concurrent_substeps=True), |
| core.execution_step('producer_exit', producer_exit_net)] |
| ) |
| |
| consumer_steps = [] |
| counters = [] |
| const_1 = init_net.ConstantFill([], 1, value=1.0) |
| for i in range(num_consumers): |
| name = 'consumer_%d' % i |
| net1 = core.Net(name) |
| blobs = net1.SafeDequeueBlobs([queue], num_blobs + 1) |
| status = blobs[-1] |
| |
| net2 = core.Net(name + '_counter') |
| counter = init_net.ConstantFill([], 1, value=0.0) |
| counters.append(counter) |
| net2.Add([counter, const_1], counter) |
| consumer_steps.append(core.execution_step( |
| name, [net1, net2], should_stop_blob=status)) |
| consumer_step = core.execution_step( |
| 'consumer', consumer_steps, concurrent_substeps=True) |
| |
| init_step = core.execution_step('init', init_net) |
| worker_step = core.execution_step( |
| 'worker', [consumer_step, producer_step], concurrent_substeps=True) |
| |
| plan = core.Plan('test') |
| plan.AddStep(init_step) |
| plan.AddStep(worker_step) |
| |
| self.ws.run(plan) |
| v = 0 |
| for counter in counters: |
| v += self.ws.blobs[str(counter)].fetch().tolist() |
| self.assertEqual(v, truth) |
| |
| @given(num_queues=st.integers(1, 5), |
| num_iter=st.integers(5, 10), |
| capacity=st.integers(1, 5), |
| num_blobs=st.integers(1, 3)) |
| @settings(deadline=None, max_examples=50) |
| def test_weighted_sample_blobs_queue( |
| self, num_queues, num_iter, capacity, num_blobs |
| ): |
| # Create BlobsQueue for each input queue |
| print("num_queues", num_queues) |
| init_net = core.Net('init_net') |
| queues = [ |
| init_net.CreateBlobsQueue( |
| [], 1, capacity=capacity, num_blobs=num_blobs |
| ) for _ in range(num_queues) |
| ] |
| |
| # Create multiple producer nets and one producer exit net |
| producer_steps = [] |
| producer_exit_nets = [] |
| for i in range(num_queues): |
| name = 'producer_%d' % i |
| net = core.Net(name) |
| blobs = [net.ConstantFill([], 1, value=1.0, run_once=False) |
| for _ in range(num_blobs)] |
| status = net.NextName() |
| net.SafeEnqueueBlobs([queues[i]] + blobs, blobs + [status]) |
| |
| exit_net = core.Net('producer_exit_%d' % i) |
| exit_net.CloseBlobsQueue(queues[i], 0) |
| producer_exit_nets.append(exit_net) |
| |
| step = core.execution_step( |
| name, [ |
| core.execution_step( |
| 'producer_%d' % i, [net], num_iter=num_iter |
| ), |
| core.execution_step('producer_exit_%d' % i, [exit_net]), |
| ] |
| ) |
| producer_steps.append(step) |
| |
| producer_step = core.execution_step( |
| 'producer', [ |
| core.execution_step( |
| 'producers', |
| producer_steps, |
| concurrent_substeps=True, |
| ), |
| ] |
| ) |
| |
| status_lst = [] |
| |
| def append(ins, outs): |
| status_lst.append(ins) |
| |
| # Create one consumer dequeue net and one consumer exit net |
| consumer_net = core.Net('weight_sample_dequeue_net') |
| table_idx_blob = np.random.randint(low=-1, high=num_blobs, size=1) |
| blobs = consumer_net.WeightedSampleDequeueBlobs( |
| queues, |
| num_blobs + 1, |
| weights=np.random.uniform(low=0.0, high=1.0, size=(num_queues,)), |
| table_idx_blob=table_idx_blob[0], |
| ) |
| status = blobs[-1] |
| consumer_net.Python(append)(status) |
| |
| consumer_step = core.execution_step( |
| 'consumer', |
| [ |
| core.execution_step( |
| 'consumer', [consumer_net], should_stop_blob=status |
| ), |
| core.execution_step('producer_exit', producer_exit_nets) |
| ] |
| ) |
| |
| init_step = core.execution_step('init', init_net) |
| worker_step = core.execution_step( |
| 'worker', [producer_step, consumer_step], concurrent_substeps=True) |
| |
| plan = core.Plan('test') |
| plan.AddStep(init_step) |
| plan.AddStep(worker_step) |
| |
| self.ws.run(plan) |
| assert len(status_lst) >= num_iter + 1 |
| assert len(status_lst) <= num_iter * num_queues + 1 |
| |
| @given( |
| data=hu.tensor(), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_squeeze_expand_dims(self, data, gc, dc): |
| dims = [0, 0] |
| if len(data.shape) > 2: |
| dims.append(2) |
| op = core.CreateOperator( |
| "ExpandDims", |
| ["data"], |
| ["expanded"], |
| dims=dims) |
| |
| def expand_dims_ref(data, *args, **kw): |
| inc_dims = list(set(dims)) |
| inc_dims.sort() |
| r = data |
| for dim in inc_dims: |
| r = np.expand_dims(r, axis=dim) |
| return (r, ) |
| |
| def squeeze_ref(data, *args, **kw): |
| dec_dims = list(set(dims)) |
| dec_dims.sort(reverse=True) |
| r = data |
| for dim in dec_dims: |
| r = np.squeeze(r, axis=dim) |
| return (r, ) |
| |
| self.assertReferenceChecks( |
| device_option=gc, |
| op=op, |
| inputs=[data], |
| reference=expand_dims_ref, |
| output_to_grad='expanded', |
| grad_reference=squeeze_ref) |
| |
| @given(**hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_tt_layer(self, gc, dc): |
| seed = 1234 |
| np.random.seed(seed) |
| |
| inp_sizes = [2, 2, 2, 2] |
| out_sizes = [2, 2, 2, 2] |
| tt_ranks = [1, 3, 3, 3, 1] |
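| # The TT layer factorizes a prod(inp_sizes) x prod(out_sizes) (here 16 x 16) |
| # weight matrix into four cores with the TT ranks above; Y is, in effect, |
| # X times that factorized matrix plus b. |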
| |
| op = core.CreateOperator( |
| "TT", |
| ["X", "b", "cores"], |
| ["Y"], |
| inp_sizes=inp_sizes, |
| out_sizes=out_sizes, |
| tt_ranks=tt_ranks, |
| ) |
| |
| X = np.expand_dims( |
| np.random.rand(16).astype(np.float32), axis=0) |
| b = np.array([0] * 16).astype(np.float32) |
| cores = tt_core.init_tt_cores(inp_sizes, out_sizes, tt_ranks) |
| |
| self.ws.create_blob("X").feed(X) |
| self.ws.create_blob("b").feed(b) |
| self.ws.create_blob("cores").feed(cores) |
| self.ws.run(op) |
| |
| Y = self.ws.blobs[("Y")].fetch() |
| Y = Y.reshape([16]) |
| |
| golden = np.array([-9.51763490e-07, -1.28442286e-06, |
| -2.86281141e-07, 2.28865644e-07, |
| -1.96180017e-06, -1.78920531e-06, |
| 9.31094666e-07, -2.04273989e-07, |
| 1.70017107e-06, 1.64845711e-06, |
| -1.06099132e-06, -4.69111137e-07, |
| 6.57552358e-08, -1.28942040e-08, |
| -2.29114004e-07, -1.04262714e-06]) |
| |
| # This golden array is dependent on the specified inp_sizes, out_sizes, |
| # tt_ranks, and seed. Changing these will cause the test to fail. |
| self.assertAlmostEqual(np.linalg.norm(golden - Y), 0, delta=1e-10) |
| |
| @given(**hu.gcs_cpu_only) |
| def test_tt_sls_layer(self, gc, dc): |
| seed = 1234 |
| np.random.seed(seed) |
| |
| factor_voc = [10, 10, 10] |
| factor_width = [2, 2, 2] |
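| # These factors encode a 10 * 10 * 10 = 1000-row embedding table with |
| # emb_size 2 * 2 * 2 = 8. With all-ones cores every reconstructed embedding |
| # entry is 16 * 16 = 256 (one term per pair of rank indices), so pooling |
| # segments of lengths 3 and 2 gives 768 and 512 per entry, matching the |
| # golden values below. |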
| |
| op = core.CreateOperator( |
| "TTSparseLengthsSum", |
| ["core0", "core1", "core2", "index", "lengths"], |
| ["Y", "core0_output", "core1_output", "indices"], |
| factor_i=factor_voc, |
| factor_j=factor_width, |
| ranks=[1, 16, 16, 1], |
| emb_size=8 |
| ) |
| c0 = np.ones([10, 1, 2, 16]).astype(np.float32) |
| c1 = np.ones([10, 16, 2, 16]).astype(np.float32) |
| c2 = np.ones([10, 16, 2, 1]).astype(np.float32) |
| # index = np.array([0, 1, 2, 1, 4], dtype=int) |
| # lengths = np.array([3, 2], dtype=int) |
| index = np.array([0, 1, 2, 1, 4], np.int64) |
| lengths = np.array([3, 2], np.int32) |
| |
| self.ws.create_blob("core0").feed(c0) |
| self.ws.create_blob("core1").feed(c1) |
| self.ws.create_blob("core2").feed(c2) |
| self.ws.create_blob("index").feed(index) |
| self.ws.create_blob("lengths").feed(lengths) |
| |
| self.ws.run(op) |
| Y = self.ws.blobs[("Y")].fetch() |
| self.assertEqual(list(Y.shape), [2, 8]) |
| |
| golden = np.array([[768, 768, 768, 768, 768, 768, 768, 768], |
| [512, 512, 512, 512, 512, 512, 512, 512]]) |
| |
| self.assertAlmostEqual(np.linalg.norm(golden - Y), 0, delta=0) |
| |
| @given(**hu.gcs_cpu_only) |
| def test_tt_sls_gradientop(self, gc, dc): |
| |
| op = core.CreateOperator( |
| "TTSparseLengthsSumGradient", |
| ["core0", "core1", "core2", "lengths", |
| "core0_out", "core1_out", "indices", "dY"], |
| ["dCore0", "dCore1", "dCore2"] |
| ) |
| |
| c0 = np.ones([10, 1, 4, 16]).astype(np.float32) |
| c1 = np.ones([10, 16, 4, 16]).astype(np.float32) |
| c2 = np.ones([10, 16, 4, 1]).astype(np.float32) |
| lengths = np.array([3, 2], np.int32) |
| |
| c0_out = np.ones([5, 4, 16]).astype(np.float32) |
| c1_out = np.ones([5, 16, 16]).astype(np.float32) |
| |
| indices = np.array([[0, 0, 0], |
| [1, 0, 0], |
| [2, 0, 0], |
| [1, 0, 0], |
| [4, 0, 0]], np.int64) |
| |
| dY = np.ones([2, 64]).astype(np.float32) |
| |
| self.ws.create_blob("core0").feed(c0) |
| self.ws.create_blob("core1").feed(c1) |
| self.ws.create_blob("core2").feed(c2) |
| self.ws.create_blob("lengths").feed(lengths) |
| self.ws.create_blob("core0_out").feed(c0_out) |
| self.ws.create_blob("core1_out").feed(c1_out) |
| self.ws.create_blob("indices").feed(indices) |
| self.ws.create_blob("dY").feed(dY) |
| |
| self.ws.run(op) |
| dCore0 = self.ws.blobs[("dCore0")].fetch() |
| dCore1 = self.ws.blobs[("dCore1")].fetch() |
| dCore2 = self.ws.blobs[("dCore2")].fetch() |
| self.assertEqual(list(dCore0.shape), list(c0.shape)) |
| self.assertEqual(list(dCore1.shape), list(c1.shape)) |
| self.assertEqual(list(dCore2.shape), list(c2.shape)) |
| |
| |
| @given(**hu.gcs_cpu_only) |
| def test_tt_sls_gradientop1(self, gc, dc): |
| |
| op = core.CreateOperator( |
| "TTSparseLengthsSumGradient", |
| ["core0", "core1", "core2", "lengths", |
| "core0_out", "core1_out", "indices", "dY"], |
| ["dCore0", "dCore1", "dCore2"] |
| ) |
| |
| c0 = np.ones([101, 1, 2, 16]).astype(np.float32) |
| c1 = np.ones([102, 16, 2, 16]).astype(np.float32) |
| c2 = np.ones([153, 16, 4, 1]).astype(np.float32) |
| lengths = np.array([3, 2], np.int32) |
| |
| c0_out = np.ones([5, 2, 16]).astype(np.float32) |
| c1_out = np.ones([5, 4, 16]).astype(np.float32) |
| |
| indices = np.array([[0, 0, 0], |
| [1, 0, 0], |
| [2, 0, 0], |
| [1, 0, 0], |
| [4, 0, 0]], np.int64) |
| |
| dY = np.ones([2, 16]).astype(np.float32) |
| |
| self.ws.create_blob("core0").feed(c0) |
| self.ws.create_blob("core1").feed(c1) |
| self.ws.create_blob("core2").feed(c2) |
| self.ws.create_blob("lengths").feed(lengths) |
| self.ws.create_blob("core0_out").feed(c0_out) |
| self.ws.create_blob("core1_out").feed(c1_out) |
| self.ws.create_blob("indices").feed(indices) |
| self.ws.create_blob("dY").feed(dY) |
| |
| self.ws.run(op) |
| dCore0 = self.ws.blobs[("dCore0")].fetch() |
| dCore1 = self.ws.blobs[("dCore1")].fetch() |
| dCore2 = self.ws.blobs[("dCore2")].fetch() |
| self.assertEqual(list(dCore0.shape), list(c0.shape)) |
| self.assertEqual(list(dCore1.shape), list(c1.shape)) |
| self.assertEqual(list(dCore2.shape), list(c2.shape)) |
| |
| @given(**hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_tt_sls(self, gc, dc): |
| factor_voc = [10, 10, 10] |
| factor_width = [2, 2, 2] |
| |
| op = core.CreateOperator( |
| "TTSparseLengthsSum", |
| ["core0", "core1", "core2", "index", "lengths"], |
| ["Y", "core0_output", "core1_output", "indices"], |
| factor_i=factor_voc, |
| factor_j=factor_width, |
| ranks=[1, 16, 16, 1], |
| emb_size=8 |
| ) |
| c0 = np.ones([10, 1, 2, 16]).astype(np.float32) |
| c1 = np.ones([10, 16, 2, 16]).astype(np.float32) |
| c2 = np.ones([10, 16, 2, 1]).astype(np.float32) |
| index = np.array([0, 1, 2, 1, 4], np.int64) |
| lengths = np.array([0, 3, 0, 0, 2, 0, 0], np.int32) |
| self.assertGradientChecks(gc, op, [c0, c1, c2, index, lengths], 0, [0]) |
| |
| |
| @given(**hu.gcs_cpu_only) |
| def test_tt_sls_repro(self, gc, dc): |
| factor_voc = [125, 160, 200] |
| factor_width = [4, 4, 4] |
| |
| op = core.CreateOperator( |
| "TTSparseLengthsSum", |
| ["core0", "core1", "core2", "index", "lengths"], |
| ["Y", "core0_output", "core1_output", "indices"], |
| factor_i=factor_voc, |
| factor_j=factor_width, |
| ranks=[1, 16, 16, 1], |
| emb_size=64 |
| ) |
| c0 = np.ones([125, 1, 4, 16]).astype(np.float32) |
| c1 = np.ones([160, 16, 4, 16]).astype(np.float32) |
| c2 = np.ones([200, 16, 4, 1]).astype(np.float32) |
| index = np.array([0, 4000000 - 1, 20000, 1000000, 4000000 - 1], np.int64) |
| lengths = np.array([0, 3, 0, 0, 2, 0, 0], np.int32) |
| |
| self.ws.create_blob("core0").feed(c0) |
| self.ws.create_blob("core1").feed(c1) |
| self.ws.create_blob("core2").feed(c2) |
| self.ws.create_blob("index").feed(index) |
| self.ws.create_blob("lengths").feed(lengths) |
| |
| self.ws.run(op) |
| Y = self.ws.blobs[("Y")].fetch() |
| self.assertEqual(list(Y.shape), [7, 64]) |
| |
| golden = np.array([[0] * 64, [768] * 64, [0] * 64, [0] * 64, [512] * 64, [0] * 64, [0] * 64]) |
| |
| self.assertAlmostEqual(np.linalg.norm(golden - Y), 0, delta=0) |
| |
| |
| @given(**hu.gcs_cpu_only) |
| def test_tt_sls_gradientop2(self, gc, dc): |
| |
| op = core.CreateOperator( |
| "TTSparseLengthsSumGradient", |
| ["core0", "core1", "core2", "lengths", |
| "core0_out", "core1_out", "indices", "dY"], |
| ["dCore0", "dCore1", "dCore2"] |
| ) |
| |
| c0 = np.ones([101, 1, 2, 16]).astype(np.float32) |
| c1 = np.ones([102, 16, 2, 16]).astype(np.float32) |
| c2 = np.ones([153, 16, 4, 1]).astype(np.float32) |
| lengths = np.array([0, 3, 0, 0, 2, 0, 0], np.int32) |
| |
| c0_out = np.ones([5, 2, 16]).astype(np.float32) |
| c1_out = np.ones([5, 4, 16]).astype(np.float32) |
| |
| indices = np.array([[0, 0, 0], |
| [1, 0, 0], |
| [2, 0, 0], |
| [1, 0, 0], |
| [4, 0, 0]], np.int64) |
| |
| dY = np.ones([7, 16]).astype(np.float32) |
| |
| self.ws.create_blob("core0").feed(c0) |
| self.ws.create_blob("core1").feed(c1) |
| self.ws.create_blob("core2").feed(c2) |
| self.ws.create_blob("lengths").feed(lengths) |
| self.ws.create_blob("core0_out").feed(c0_out) |
| self.ws.create_blob("core1_out").feed(c1_out) |
| self.ws.create_blob("indices").feed(indices) |
| self.ws.create_blob("dY").feed(dY) |
| |
| self.ws.run(op) |
| dCore0 = self.ws.blobs[("dCore0")].fetch() |
| dCore1 = self.ws.blobs[("dCore1")].fetch() |
| dCore2 = self.ws.blobs[("dCore2")].fetch() |
| self.assertEqual(list(dCore0.shape), list(c0.shape)) |
| self.assertEqual(list(dCore1.shape), list(c1.shape)) |
| self.assertEqual(list(dCore2.shape), list(c2.shape)) |
| |
| @given(num_workers=st.integers(1, 10), |
| net_type=st.sampled_from( |
| ["simple", "dag"] + |
| (["async_dag"] if workspace.has_gpu_support else [])), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_dag_net_forking(self, net_type, num_workers, gc, dc): |
| from caffe2.python.model_helper import ModelHelper |
| from caffe2.python import brew |
| m = ModelHelper(name="test_model") |
| n = 10 |
| d = 2 |
| depth = 2 |
| iters = 5 |
| np.random.seed(1701) |
| # Build a binary tree of FC layers, summing at each node. |
| for i in reversed(range(depth)): |
| for j in range(2 ** i): |
| bottom_1 = "{}_{}".format(i + 1, 2 * j) |
| bottom_2 = "{}_{}".format(i + 1, 2 * j + 1) |
| mid_1 = "{}_{}_m".format(i + 1, 2 * j) |
| mid_2 = "{}_{}_m".format(i + 1, 2 * j + 1) |
| top = "{}_{}".format(i, j) |
| brew.fc( |
| m, |
| bottom_1, mid_1, |
| dim_in=d, dim_out=d, |
| weight_init=('ConstantFill', dict(value=np.random.randn())), |
| bias_init=('ConstantFill', dict(value=np.random.randn()))) |
| brew.fc( |
| m, |
| bottom_2, mid_2, |
| dim_in=d, dim_out=d, |
| weight_init=('ConstantFill', dict(value=np.random.randn())), |
| bias_init=('ConstantFill', dict(value=np.random.randn()))) |
| m.net.Sum([mid_1, mid_2], top) |
| m.net.SquaredL2Distance(["0_0", "label"], "xent") |
| m.net.AveragedLoss("xent", "loss") |
| input_to_grad = m.AddGradientOperators(["loss"]) |
| m.Proto().device_option.CopyFrom(gc) |
| m.param_init_net.Proto().device_option.CopyFrom(gc) |
| |
| m.Proto().type = net_type |
| m.Proto().num_workers = num_workers |
| |
| self.ws.run(m.param_init_net) |
| |
| print(str(m.Proto())) |
| |
| def run(): |
| import numpy as np |
| np.random.seed(1701) |
| input_blobs = ["{}_{}".format(depth, j) for j in range(2 ** depth)] |
| for input_blob in input_blobs: |
| self.ws.create_blob(input_blob).feed( |
| np.random.randn(n, d).astype(np.float32), |
| device_option=gc) |
| self.ws.create_blob("label").feed( |
| np.random.randn(n, d).astype(np.float32), |
| device_option=gc) |
| self.ws.run(m.net) |
| gradients = [ |
| self.ws.blobs[str(input_to_grad[input_blob])].fetch() |
| for input_blob in input_blobs] |
| return gradients |
| |
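| # With a fixed seed, repeated runs must produce identical gradients |
| # regardless of net type or number of workers. |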
| outputs = [run() for _ in range(iters)] |
| for output in outputs[1:]: |
| np.testing.assert_array_equal(outputs[0], output) |
| self.assertAlmostEqual(np.sum(np.square(output)), 91.81752, |
| delta=1e-2) |
| |
| @given(input=hu.tensor(min_dim=2, max_dim=6), |
| slice_dim=st.integers(), |
| a=st.integers(), |
| b=st.integers(), |
| is_empty=st.booleans(), |
| **hu.gcs_cpu_only) |
| @settings(deadline=None, max_examples=50) |
| def test_slice(self, input, slice_dim, a, b, is_empty, gc, dc): |
| slice_dim = slice_dim % len(input.shape) |
| if is_empty: |
| input = np.random.rand(*([0] + list(input.shape))).astype(np.int32) |
| slice_dim += 1 |
| |
| a = a % input.shape[slice_dim] |
| b = b % input.shape[slice_dim] + 1 |
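| # Slice along slice_dim only: start stays 0 and end stays -1 (meaning |
| # "to the end") in every other dimension. |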
| start_vec = np.zeros(len(input.shape), dtype=np.int32) |
| end_vec = np.ones(len(input.shape), dtype=np.int32) * -1 |
| start_vec[slice_dim] = min(a, b) |
| end_vec[slice_dim] = max(a, b) |
| op = core.CreateOperator( |
| "Slice", |
| ["input", "start", "end"], |
| ["output"]) |
| |
| def slice_ref(x, s, e): |
| if len(s.shape) == 0: |
| return x |
| slc = tuple(slice(si, None if ei == -1 else ei) for si, ei in zip(s, e)) |
| return (x[slc], ) |
| |
| self.assertReferenceChecks(gc, op, [input, start_vec, end_vec], |
| slice_ref) |
| self.assertGradientChecks(gc, op, [input, start_vec, end_vec], 0, [0]) |
| |
| @given(data=hu.tensor(), **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_shape(self, data, gc, dc): |
| op = core.CreateOperator("Shape", ["data"], ["shape"]) |
| self.assertReferenceChecks(gc, op, [data], lambda x: (x.shape, )) |
| |
| @given(data=hu.tensor(), **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_shape_with_axes(self, data, gc, dc): |
| def shape_ref(x, y): |
| return ([x.shape[i] for i in y],) |
| axes = np.random.randint(len(data.shape), size=10).tolist() |
| op = core.CreateOperator("Shape", ["data"], ["shape"], axes=axes) |
| self.assertReferenceChecks(gc, op, [data, axes], shape_ref) |
| |
| @given(x=hu.tensor(), y=hu.tensor(), **hu.gcs_cpu_only) |
| @settings(deadline=1000) |
| def test_has_elements(self, x, y, gc, dc): |
| op = core.CreateOperator("HasElements", ["x", "y"], ["has_elements"]) |
| self.assertReferenceChecks(gc, op, [x, y], lambda x, y: (len(x) > 0 or len(y) > 0, )) |
| |
| op = core.CreateOperator("IsEmpty", ["x"], ["is_empty"]) |
| self.assertReferenceChecks(gc, op, [x], lambda x: (len(x) == 0, )) |
| |
| @given(initial_iters=st.integers(0, 100), |
| max_iters=st.integers(0, 100)) |
| @settings(deadline=10000) |
| def test_should_stop_as_criteria_net_execution_step( |
| self, initial_iters, max_iters): |
| net = core.Net("net") |
| net.Iter(["iter"], ["iter"]) |
| self.ws.create_blob("iter").feed( |
| np.asarray([initial_iters]).astype(np.int64)) |
| self.ws.create_blob("num_iters").feed( |
| np.asarray([max_iters]).astype(np.int64)) |
| criteria_net = core.Net("criteria") |
| criteria_net.GE(["iter", "num_iters"], ["stop"]) |
| criteria_net.Proto().external_output.extend(["stop"]) |
| |
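| # criteria_net recomputes `stop` on every pass, so the loop ends once |
| # iter >= num_iters and iter settles at max(initial_iters, max_iters). |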
| plan = core.Plan('plan') |
| plan.AddStep(core.execution_step( |
| 'step', [criteria_net, net], |
| should_stop_blob=core.BlobReference("stop"))) |
| self.ws.run(plan) |
| iters = self.ws.blobs[("iter")].fetch() |
| self.assertEqual(iters.dtype, np.int64) |
| self.assertEqual(iters[0], max(initial_iters, max_iters)) |
| |
| def test_disabled_execution_step(self): |
| def createNets(i, disabled): |
| should_stop = 'should_stop_{}'.format(i) |
| output = 'output_{}'.format(i) |
| |
| # init content and stop signal |
| init = core.Net("init_{}".format(i)) |
| init.ConstantFill( |
| [], |
| [output], |
| shape=[1], |
| value=0.0 |
| ) |
| init.Cast([output], [should_stop], to='bool') |
| |
| # decide if disabled or not |
| criterion = core.Net("criterion_{}".format(i)) |
| tmp = criterion.ConstantFill( |
| [], |
| shape=[1], |
| value=1.0 if disabled else 0.0 |
| ) |
| criterion.Cast([tmp], [should_stop], to='bool') |
| criterion.Proto().external_output.extend([should_stop]) |
| |
| # the body net is just to turn a 0 blob to 1 |
| net = core.Net("net_{}".format(i)) |
| net.ConstantFill( |
| [], |
| [output], |
| shape=[1], |
| value=1.0 |
| ) |
| |
| # always end the loop |
| ender = core.Net("ender_{}".format(i)) |
| tmp = ender.ConstantFill( |
| [], |
| shape=[1], |
| value=1.0 |
| ) |
| ender.Cast([tmp], [should_stop], to='bool') |
| ender.Proto().external_output.extend([should_stop]) |
| |
| return [init, criterion, net, ender] |
| |
| nets = [createNets(1, False), |
| createNets(2, True), |
| createNets(3, False)] |
| steps = [ |
| core.execution_step( |
| 'step_1', nets[0], |
| should_stop_blob=core.BlobReference('should_stop_1')), |
| core.execution_step( |
| 'step_2', nets[1], |
| should_stop_blob=core.BlobReference('should_stop_2')), |
| core.execution_step('step_3', nets[2]) |
| ] |
| expected = [1.0, 0.0, 1.0] |
| |
| plan = core.Plan('plan') |
| plan.AddStep(core.execution_step('all_steps', steps, num_iter=3)) |
| self.ws.run(plan) |
| |
| for i, _ in enumerate(nets): |
| self.assertEqual( |
| self.ws.blobs['output_{}'.format(i + 1)].fetch()[0], |
| expected[i]) |
| |
| @given(initial_iters=st.integers(0, 100), |
| num_iters=st.integers(0, 100)) |
| @settings(deadline=10000) |
| def test_iter_count_with_execution_step(self, initial_iters, num_iters): |
| net = core.Net("net") |
| net.Iter(["iter"], ["iter"]) |
| self.ws.create_blob("iter").feed( |
| np.asarray([initial_iters]).astype(np.int64)) |
| |
| step = core.ExecutionStep("step", [net]) |
| step.SetIter(num_iters) |
| |
| plan = core.Plan("plan") |
| plan.AddStep(step) |
| self.ws.run(plan) |
| iters = self.ws.blobs[("iter")].fetch() |
| self.assertEqual(iters.dtype, np.int64) |
| self.assertEqual(iters[0], initial_iters + num_iters) |
| |
| |
| @given(initial_iters=st.integers(0, 100), |
| num_iters=st.integers(0, 100), |
| num_nets=st.integers(0, 5)) |
| @settings(deadline=None, max_examples=50) |
| def test_atomic_iter_with_concurrent_steps(self, initial_iters, num_iters, |
| num_nets): |
| init_net = core.Net("init_net") |
| iter_mutex = init_net.CreateMutex([], ["iter_mutex"]) |
| self.ws.create_blob("iter").feed( |
| np.asarray([initial_iters]).astype(np.int64)) |
| concurrent_steps = core.ExecutionStep("concurrent_steps", |
| num_iter=num_iters) |
| for i in range(num_nets): |
| net = core.Net("net_{}".format(i)) |
| net.AtomicIter([iter_mutex, "iter"], ["iter"]) |
| step = core.ExecutionStep("step", [net]) |
| concurrent_steps.AddSubstep(step) |
| |
| concurrent_steps.SetConcurrentSubsteps(True) |
| plan = core.Plan("plan") |
| plan.AddStep(concurrent_steps) |
| |
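| # AtomicIter also counts its increments in the stat registry; export the |
| # registry so the total can be verified below. |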
| stats_net = core.Net("stats_net") |
| stats_net.StatRegistryExport([], ["stats_key", "stats_val", "stats_ts"]) |
| |
| self.ws.run(init_net) |
| self.ws.run(plan) |
| self.ws.run(stats_net) |
| iters = self.ws.blobs[("iter")].fetch() |
| self.assertEqual(iters.dtype, np.int64) |
| self.assertEqual(iters[0], initial_iters + num_iters * num_nets) |
| |
| if num_iters * num_nets > 0: |
| stats_key = self.ws.blobs[("stats_key")].fetch() |
| atomic_iter_key = b'atomic_iter/stats/iter/num_iter' |
| self.assertTrue(atomic_iter_key in stats_key) |
| stat_val = self.ws.blobs[("stats_val")].fetch() |
| self.assertEqual(num_iters * num_nets, stat_val[list(stats_key).index(atomic_iter_key)]) |
| |
| |
| @given(a=hu.tensor(), |
| src=st.sampled_from(list(_NUMPY_TYPE_TO_ENUM.keys())), |
| dst=st.sampled_from(list(_NUMPY_TYPE_TO_ENUM.keys())), |
| use_name=st.booleans(), |
| **hu.gcs) |
| @settings(deadline=1000) |
| def test_cast(self, a, src, dst, use_name, gc, dc): |
| a = a.astype(src) |
| |
| # Casting from a float type outside the range of the integral |
| # type is UB. |
| ftypes = [np.float32, np.float64] |
| if src in ftypes and dst not in ftypes and dst is not bool: |
| info = np.iinfo(dst) |
| a = np.clip(a, info.min, info.max) |
| |
| def ref(data): |
| return [data.astype(dst)] |
| |
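| # The Cast op accepts `to` either as the DataType enum value or as its |
| # lowercase string name. |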
| to = _NUMPY_TYPE_TO_ENUM[dst] |
| if use_name: |
| to = caffe2_pb2.TensorProto.DataType.Name(to).lower() |
| op = core.CreateOperator('Cast', ["X"], ["Y"], to=to) |
| self.assertDeviceChecks(dc, op, [a], [0]) |
| out, = self.assertReferenceChecks(gc, op, [a], ref) |
| self.assertEqual(dst, out.dtype) |
| |
| @given(a=hu.tensor(), |
| eps=hu.floats(min_value=1e-4, max_value=1e-2), |
| a_grad=hu.tensor(elements=hu.floats(min_value=0.01, max_value=0.99)), |
| eps_grad=hu.floats(min_value=1e-4, max_value=1e-3), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_logit(self, a, eps, a_grad, eps_grad, gc, dc): |
| def ref(data): |
| data = np.clip(data, eps, 1.0 - eps) |
| return (np.log(data / (1 - data)), ) |
| # Forward testing is carried out over the full input range to preserve the |
| # original test coverage. The gradient test uses a reduced input range of |
| # (0.01, 0.99): the logit curve becomes very steep near 0 and 1, so the |
| # numerical error increases dramatically there and the check would fail. |
| # Very occasionally the test may still fail from accumulated random error; |
| # reducing the range to (0.02, 0.98) would further improve stability. |
| op = core.CreateOperator('Logit', ["X"], ["Y"], eps=eps) |
| self.assertDeviceChecks(dc, op, [a], [0]) |
| self.assertReferenceChecks(gc, op, [a], ref) |
| op_grad = core.CreateOperator('Logit', ["X"], ["Y"], eps=eps_grad) |
| self.assertGradientChecks(gc, op_grad, [a_grad], 0, [0], |
| threshold=0.04, stepsize=2e-3) |
| |
| @given(a=hu.tensor(elements=hu.floats(allow_nan=True)), |
| value=hu.floats(min_value=-10, max_value=10), |
| **hu.gcs) |
| @settings(deadline=1000) |
| def test_replace_nan(self, a, value, gc, dc): |
| def ref(data): |
| out = np.copy(data) |
| out[np.isnan(data)] = value |
| return (out, ) |
| |
| op = core.CreateOperator('ReplaceNaN', ["X"], ["Y"], value=value) |
| self.assertDeviceChecks(dc, op, [a], [0]) |
| self.assertReferenceChecks(gc, op, [a], ref) |
| |
| @given(data=_dtypes(dtypes=[np.int32, np.int64, np.float32, bool]). |
| flatmap(lambda dtype: hu.tensor( |
| min_dim=1, dtype=dtype, elements=hu.elements_of_type(dtype))), |
| has_input=st.booleans(), |
| has_extra_shape=st.booleans(), |
| extra_shape=st.lists( |
| min_size=1, max_size=5, elements=st.integers(1, 5)), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_constant_fill(self, data, has_input, has_extra_shape, extra_shape, |
| gc, dc): |
| dtype = data.dtype.type |
| # in opt mode, bool is converted into np.bool_ |
| if data.dtype == np.dtype(bool): |
| dtype = bool |
| |
| value = data.item(0) |
| gt_shape = data.shape |
| inputs = [data] |
| enum_type = _NUMPY_TYPE_TO_ENUM[dtype] |
| |
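| # Three variants are exercised: shape taken from the input plus extra_shape, |
| # shape taken from the input alone, and an explicit shape argument with no |
| # input at all. |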
| if has_input: |
| if has_extra_shape: |
| op = core.CreateOperator('ConstantFill', ["X"], ["Y"], |
| dtype=enum_type, |
| extra_shape=extra_shape, |
| value=value) |
| gt_shape += tuple(extra_shape) |
| else: |
| op = core.CreateOperator('ConstantFill', ["X"], ["Y"], |
| dtype=enum_type, |
| value=value) |
| else: |
| op = core.CreateOperator('ConstantFill', [], ["Y"], |
| dtype=enum_type, |
| value=value, |
| shape=list(gt_shape)) |
| inputs = [] |
| |
| def ref(inputs=None): |
| outputs = np.full(shape=gt_shape, fill_value=value, dtype=dtype) |
| return [outputs] |
| |
| self.assertDeviceChecks(dc, op, inputs, [0]) |
| out, = self.assertReferenceChecks(gc, op, inputs, ref) |
| self.assertEqual(dtype, out.dtype) |
| |
| @given(data=_dtypes(dtypes=[np.int32, np.int64, np.float32, bool]). |
| flatmap(lambda dtype: hu.tensor( |
| min_dim=1, dtype=dtype, elements=hu.elements_of_type(dtype))), |
| **hu.gcs) |
| @settings(deadline=1000) |
| def test_constant_fill_from_tensor(self, data, gc, dc): |
| dtype = data.dtype.type |
| if data.dtype == np.dtype(bool): |
| dtype = bool |
| |
| value = np.array([data.item(0)], dtype=dtype) |
| inputs = [data, value] |
| enum_type = _NUMPY_TYPE_TO_ENUM[dtype] |
| |
| op = core.CreateOperator( |
| 'ConstantFill', |
| ["X", "V"], |
| ["Y"], |
| dtype=enum_type, |
| ) |
| |
| def ref(x, v): |
| outputs = np.full(shape=data.shape, fill_value=value[0], dtype=dtype) |
| return [outputs] |
| |
| self.assertDeviceChecks(dc, op, inputs, [0]) |
| out, = self.assertReferenceChecks(gc, op, inputs, ref) |
| self.assertEqual(dtype, out.dtype) |
| |
| @given(t=st.integers(1, 5), |
| n=st.integers(1, 5), |
| d=st.integers(1, 5)) |
| @settings(deadline=10000) |
| def test_elman_recurrent_network(self, t, n, d): |
| from caffe2.python import model_helper, brew |
| np.random.seed(1701) |
| step_net = model_helper.ModelHelper(name="Elman") |
| # TODO: name scope external inputs and outputs |
| step_net.Proto().external_input.extend( |
| ["input_t", "seq_lengths", "timestep", |
| "hidden_t_prev", "gates_t_w", "gates_t_b"]) |
| step_net.Proto().type = "simple" |
| step_net.Proto().external_output.extend(["hidden_t", "gates_t"]) |
| brew.fc(step_net, |
| "hidden_t_prev", "gates_t", dim_in=d, dim_out=d, axis=2) |
| step_net.net.Sum(["gates_t", "input_t"], ["gates_t"]) |
| step_net.net.Sigmoid(["gates_t"], ["hidden_t"]) |
| |
| # Initialize params for step net in the parent net |
| for op in step_net.param_init_net.Proto().op: |
| workspace.RunOperatorOnce(op) |
| |
| backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass( |
| step_net.Proto().op, {"hidden_t": "hidden_t_grad"}) |
| backward_mapping = { |
| str(k): str(v) for k, v in backward_mapping.items() |
| } |
| backward_step_net = core.Net("ElmanBackward") |
| del backward_step_net.Proto().op[:] |
| backward_step_net.Proto().op.extend(backward_ops) |
| assert backward_mapping["input_t"] == "gates_t_grad" |
| links = [ |
| ("hidden_t_prev", "hidden", 0), |
| ("hidden_t", "hidden", 1), |
| ("input_t", "input", 0), |
| ] |
| link_internal, link_external, link_offset = zip(*links) |
| backward_links = [ |
| ("hidden_t_prev_grad", "hidden_grad", 0), |
| ("hidden_t_grad", "hidden_grad", 1), |
| ("gates_t_grad", "input_grad", 0), |
| ] |
| backward_link_internal, backward_link_external, backward_link_offset = \ |
| zip(*backward_links) |
| backward_step_net.Proto().external_input.extend(["hidden_t_grad"]) |
| backward_step_net.Proto().external_input.extend( |
| step_net.Proto().external_input) |
| backward_step_net.Proto().external_input.extend( |
| step_net.Proto().external_output) |
| inputs = ["input", "seq_lengths", "gates_t_w", "gates_t_b", "hidden_input"] |
| recurrent_inputs = ["hidden_input"] |
| op = core.CreateOperator( |
| "RecurrentNetwork", |
| inputs, |
| ["output", "hidden", "hidden_output", "step_workspaces"], |
| alias_src=["hidden", "hidden"], |
| alias_dst=["output", "hidden_output"], |
| alias_offset=[1, -1], |
| recurrent_states=["hidden"], |
| initial_recurrent_state_ids=[ |
| inputs.index(i) for i in recurrent_inputs |
| ], |
| link_internal=link_internal, |
| link_external=link_external, |
| link_offset=link_offset, |
| backward_link_internal=backward_link_internal, |
| backward_link_external=backward_link_external, |
| backward_link_offset=backward_link_offset, |
| param=[inputs.index(p) for p in step_net.params], |
| step_net=step_net.Proto(), |
| backward_step_net=backward_step_net.Proto(), |
| outputs_with_grads=[0], |
| ) |
| workspace.FeedBlob( |
| "input", np.random.randn(t, n, d).astype(np.float32)) |
| workspace.FeedBlob( |
| "hidden_input", np.random.randn(1, n, d).astype(np.float32)) |
| workspace.FeedBlob( |
| "seq_lengths", np.random.randint(0, t, size=(n,)).astype(np.int32)) |
| |
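| # Reference Elman cell: h_t = sigmoid(h_{t-1} . W^T + x_t). The FC bias is |
| # left at its default zero fill, so the reference omits it. |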
| def reference(input, seq_lengths, gates_w, gates_b, hidden_input): |
| T = input.shape[0] |
| N = input.shape[1] |
| D = input.shape[2] |
| hidden = np.zeros(shape=(T + 1, N, D)) |
| assert hidden.shape[0] == T + 1 |
| assert hidden.shape[1] == N |
| assert hidden.shape[2] == D |
| |
| hidden[0, :, :] = hidden_input |
| for t in range(T): |
| input_t = input[t].reshape(1, N, D) |
| hidden_t_prev = hidden[t].reshape(1, N, D) |
| gates = np.dot(hidden_t_prev, gates_w.T) |
| gates = gates.reshape(1, N, D) + input_t.reshape(1, N, D) |
| hidden[t + 1] = sigmoid(gates) |
| return hidden[1:], hidden, hidden[-1].reshape(1, N, D) |
| |
| self.assertReferenceChecks( |
| hu.cpu_do, |
| op, |
| [workspace.FetchBlob(name) |
| for name in ["input", "seq_lengths", "gates_t_w", "gates_t_b", |
| "hidden_input"]], |
| reference, |
| outputs_to_check=[0, 1, 2]) |
| |
| for param in [0, 2, 3]: |
| self.assertGradientChecks( |
| hu.cpu_do, |
| op, |
| [workspace.FetchBlob(name) |
| for name in ["input", "seq_lengths", "gates_t_w", "gates_t_b", |
| "hidden_input"]], |
| param, |
| [0]) |
| |
| @settings(suppress_health_check=[HealthCheck.filter_too_much], deadline=10000) |
| @given(n=st.integers(1, 5), |
| c=st.integers(1, 5), |
| h=st.integers(1, 5), |
| w=st.integers(1, 5), |
| pad=st.integers(0, 2), |
| block_size=st.integers(2, 3), |
| **hu.gcs) |
| def test_space_to_batch(self, n, c, h, w, pad, block_size, gc, dc): |
| assume((h + 2 * pad) % block_size == 0) |
| assume((w + 2 * pad) % block_size == 0) |
| X = np.random.randn(n, c, h, w).astype(np.float32) |
| op = core.CreateOperator("SpaceToBatch", ["X"], ["Y"], |
| pad=pad, block_size=block_size) |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| |
| @settings(suppress_health_check=[HealthCheck.filter_too_much], deadline=10000) |
| @given(n=st.integers(1, 5), |
| c=st.integers(1, 5), |
| h=st.integers(1, 5), |
| w=st.integers(1, 5), |
| pad=st.integers(0, 2), |
| block_size=st.integers(2, 3), |
| **hu.gcs) |
| def test_batch_to_space(self, n, c, h, w, pad, block_size, gc, dc): |
| assume((h + 2 * pad) % block_size == 0) |
| assume((w + 2 * pad) % block_size == 0) |
| X = np.random.randn( |
| n * block_size * block_size, |
| c, |
| (h + 2 * pad) // block_size, |
| (w + 2 * pad) // block_size).astype(np.float32) |
| op = core.CreateOperator("BatchToSpace", ["X"], ["Y"], |
| pad=pad, block_size=block_size) |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| |
| @given(X=hu.tensor(), |
| in_place=st.booleans(), |
| scale=hu.floats(min_value=-2.0, max_value=2.0), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_scale(self, X, in_place, scale, gc, dc): |
| op = core.CreateOperator( |
| "Scale", ["X"], ["Y" if not in_place else "X"], |
| scale=scale) |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| |
| @given(s=st.text()) |
| def test_string_serde(self, s): |
| s = s.encode('ascii', 'ignore') |
| self.ws.create_blob("a").feed(s) |
| serialized = self.ws.blobs["a"].serialize("a") |
| self.ws.create_blob("b").deserialize(serialized) |
| self.assertEqual(s, self.ws.blobs[("a")].fetch()) |
| self.assertEqual(s, self.ws.blobs[("b")].fetch()) |
| |
| @given(pad=st.integers(0, 3), |
| size=st.integers(1, 10), |
| input_channels=st.integers(1, 5), |
| batch_size=st.integers(1, 5), |
| order=st.sampled_from(["NCHW", "NHWC"]), |
| mode=st.sampled_from(["constant", "reflect", "edge"]), |
| **hu.gcs) |
| @settings(deadline=None, max_examples=50) |
| def test_same_pad_image(self, pad, size, input_channels, batch_size, order, |
| mode, gc, dc): |
| assume(size > pad) |
| |
| op = core.CreateOperator( |
| "PadImage", |
| ["X"], |
| ["Y"], |
| pad=pad, |
| mode=mode, |
| order=order, |
| ) |
| if order == "NHWC": |
| X = np.random.rand( |
| batch_size, size, size, input_channels).astype(np.float32) - 0.5 |
| |
| def numpy_pad_ref(x): |
| return (np.pad( |
| x, ((0, 0), (pad, pad), (pad, pad), (0, 0)), mode),) |
| |
| else: |
| X = np.random.rand( |
| batch_size, input_channels, size, size).astype(np.float32) - 0.5 |
| |
| def numpy_pad_ref(x): |
| return (np.pad( |
| x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode),) |
| |
| self.assertReferenceChecks(gc, op, [X], numpy_pad_ref) |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| |
| @given(pad_t=st.integers(0, 3), |
| pad_l=st.integers(0, 3), |
| pad_b=st.integers(0, 3), |
| pad_r=st.integers(0, 3), |
| size=st.integers(1, 10), |
| input_channels=st.integers(1, 5), |
| batch_size=st.integers(1, 5), |
| order=st.sampled_from(["NCHW", "NHWC"]), |
| mode=st.sampled_from(["constant", "reflect", "edge"]), |
| **hu.gcs) |
| @settings(deadline=None, max_examples=50) |
| def test_pad_image(self, pad_t, pad_l, pad_b, pad_r, size, input_channels, |
| batch_size, order, mode, gc, dc): |
| assume(size > max(pad_b, pad_r, pad_t, pad_l)) |
| |
| op = core.CreateOperator( |
| "PadImage", |
| ["X"], |
| ["Y"], |
| pad_t=pad_t, |
| pad_l=pad_l, |
| pad_b=pad_b, |
| pad_r=pad_r, |
| mode=mode, |
| order=order, |
| ) |
| if order == "NHWC": |
| X = np.random.rand( |
| batch_size, size, size, input_channels).astype(np.float32) - 0.5 |
| |
| def numpy_pad_ref(x): |
| return (np.pad( |
| x, ((0, 0), (pad_t, pad_b), (pad_l, pad_r), (0, 0)), |
| mode),) |
| |
| else: |
| X = np.random.rand( |
| batch_size, input_channels, size, size).astype(np.float32) - 0.5 |
| |
| def numpy_pad_ref(x): |
| return (np.pad( |
| x, ((0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)), |
| mode),) |
| |
| self.assertReferenceChecks(gc, op, [X], numpy_pad_ref) |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| self.assertGradientChecks(gc, op, [X], 0, [0]) |
| |
| @given(size=st.integers(7, 10), |
| input_channels=st.integers(1, 10), |
| batch_size=st.integers(1, 3), |
| order=st.sampled_from(["NCHW", "NHWC"]), |
| epsilon=hu.floats(min_value=1e-4, max_value=1e-2), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_instance_norm(self, size, input_channels, batch_size, order, |
| epsilon, gc, dc): |
| op = core.CreateOperator( |
| "InstanceNorm", |
| ["X", "scale", "bias"], |
| ["Y"], |
| order=order, |
| epsilon=epsilon, |
| ) |
| np.random.seed(1701) |
| scale = np.random.rand(input_channels).astype(np.float32) + 0.5 |
| bias = np.random.rand(input_channels).astype(np.float32) - 0.5 |
| X = np.random.rand( |
| batch_size, input_channels, size, size).astype(np.float32) - 0.5 |
| if order == "NHWC": |
| X = X.swapaxes(1, 2).swapaxes(2, 3) |
| |
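| # Normalize each (sample, channel) plane over its spatial positions, then |
| # apply the per-channel scale and bias. |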
| def ref_nchw(x, scale, bias): |
| x = x.reshape(batch_size * input_channels, size * size) |
| y = (x - x.mean(1)[:, np.newaxis]) |
| y /= np.sqrt(x.var(1) + epsilon)[:, np.newaxis] |
| y = y.reshape(batch_size, input_channels, size, size) |
| y = y * scale.reshape(1, input_channels, 1, 1) |
| y = y + bias.reshape(1, input_channels, 1, 1) |
| return (y, ) |
| |
| def ref_nhwc(x, scale, bias): |
| x = x.swapaxes(2, 3).swapaxes(1, 2) |
| y = ref_nchw(x, scale, bias)[0] |
| return (y.swapaxes(1, 2).swapaxes(2, 3), ) |
| |
| self.assertReferenceChecks( |
| gc, op, [X, scale, bias], |
| ref_nchw if order == "NCHW" else ref_nhwc) |
| # TODO(jiayq): when there are backward and GPU implementations, enable |
| # these two. |
| # self.assertDeviceChecks(dc, op, [X, scale, bias], [0]) |
| # self.assertGradientChecks(gc, op, [X, scale, bias], 0, [0]) |
| |
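| # Re-run the op many times in a fresh workspace and verify that none of the |
| # inputs are modified in place. |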
| ws = workspace.C.Workspace() |
| feeds = [("X", X), ("scale", scale), ("bias", bias)] |
| for blob, arr in feeds: |
| ws.create_blob(blob).feed(arr) |
| for _ in range(100): |
| ws.run(op) |
| for blob, arr in feeds: |
| np.testing.assert_array_equal(ws.blobs[blob].fetch(), arr) |
| |
| @given(inp=_dtypes().flatmap(lambda dt: _tensor_and_indices( |
| elements=hu.elements_of_type(dt), dtype=dt)), |
| **hu.gcs) |
| @settings(deadline=10000) |
| def test_sparse_to_dense(self, inp, gc, dc): |
| first_dim, X, I = inp |
| if X.dtype != np.dtype('float32') and gc.device_type in {caffe2_pb2.CUDA, caffe2_pb2.HIP}: |
| # CUDA only supports 32-bit floats. |
| print("Bailout {}".format(X.dtype)) |
| return |
| if gc.device_type in {caffe2_pb2.CUDA, caffe2_pb2.HIP}: |
| # The CUDA version only supports int32 indices. |
| I = I.astype(np.int32) |
| |
| if X.dtype in (np.dtype('int64'), np.dtype('int32')): |
| assume(np.abs(X.ravel()).max() < np.iinfo('int32').max) |
| assume(np.abs(X.ravel()).astype(np.int64).sum() < np.iinfo('int32').max) |
| |
| # the values of D don't matter, only its shape |
| D = np.zeros((first_dim,) + X.shape[1:]).astype(X.dtype) |
| |
| op = core.CreateOperator("SparseToDense", ["I", "X", "D"], ["Y"]) |
| op_noshapeinfer = core.CreateOperator("SparseToDense", ["I", "X"], ["Y"]) |
| |
| def sparse_to_dense(I, X, D): |
| O = np.zeros(D.shape, dtype=X.dtype) |
| for i, p in enumerate(I): |
| O[p] += X[i] |
| return [O] |
| |
| def sparse_to_dense_noshapeinfer(I, X): |
| O = np.zeros((np.max(I) + 1,) + X.shape[1:], dtype=X.dtype) |
| for i, p in enumerate(I): |
| O[p] += X[i] |
| return [O] |
| |
| self.assertReferenceChecks(gc, op, [I, X, D], sparse_to_dense) |
| self.assertReferenceChecks(gc, op_noshapeinfer, [I, X], sparse_to_dense_noshapeinfer) |
| if X.dtype == np.float32: |
| self.assertGradientChecks(gc, op, [I, X, D], 1, [0]) |
| |
| @given(inputs=hu.tensors(n=2, min_dim=2, max_dim=2), **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_dot_product(self, inputs, gc, dc): |
| X, Y = inputs |
| op = core.CreateOperator("DotProduct", ["X", "Y"], 'out') |
| |
| def dotproduct(X, Y): |
| return (np.sum(X * Y, axis=1), ) |
| |
| self.assertReferenceChecks(gc, op, [X, Y], dotproduct) |
| self.assertDeviceChecks(dc, op, [X, Y], [0]) |
| self.assertGradientChecks(gc, op, [X, Y], 0, [0]) |
| self.assertGradientChecks(gc, op, [X, Y], 1, [0]) |
| |
| @given(N=st.integers(min_value=2, max_value=10), |
| M=st.integers(min_value=2, max_value=10), |
| K=st.integers(min_value=2, max_value=10), |
| pad_value=hu.floats(min_value=0.1, max_value=1.0), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_dot_product_with_padding(self, N, M, K, pad_value, gc, dc): |
| X = np.random.rand(N, M).astype(np.float32) - 0.5 |
| Y = np.random.rand(N, K).astype(np.float32) - 0.5 |
| op = core.CreateOperator("DotProductWithPadding", ["X", "Y"], 'out', |
| pad_value=pad_value) |
| |
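| # The reference pads the narrower of X / Y with pad_value up to the wider |
| # width before taking the row-wise dot product. |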
| def dotproduct(X, Y): |
| Z = np.ones((N, max(M, K))).astype(np.float32) * pad_value |
| if M < K: |
| Z[:, :M] = X |
| return (np.sum(Z * Y, axis=1), ) |
| else: |
| Z[:, :K] = Y |
| return (np.sum(Z * X, axis=1), ) |
| |
| self.assertReferenceChecks(gc, op, [X, Y], dotproduct) |
| self.assertDeviceChecks(dc, op, [X, Y], [0]) |
| self.assertGradientChecks(gc, op, [X, Y], 0, [0]) |
| self.assertGradientChecks(gc, op, [X, Y], 1, [0]) |
| |
| @given(N=st.integers(min_value=2, max_value=10), |
| M=st.integers(min_value=2, max_value=10), |
| pad_value=hu.floats(min_value=0.1, max_value=1.0), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_dot_product_with_rep_padding(self, N, M, pad_value, gc, dc): |
| K = 2 * M |
| X = np.random.rand(N, M).astype(np.float32) - 0.5 |
| Y = np.random.rand(N, K).astype(np.float32) - 0.5 |
| op = core.CreateOperator("DotProductWithPadding", ["X", "Y"], 'out', |
| replicate=True, |
| pad_value=pad_value) |
| |
| def dotproduct(X, Y): |
| # Replicate the narrower input along the feature axis to match the wider one. |
| if M < K: |
| Z = np.tile(X, (1, K // M)) |
| return (np.sum(Z * Y, axis=1), ) |
| else: |
| Z = np.tile(Y, (1, M // K)) |
| return (np.sum(Z * X, axis=1), ) |
| |
| self.assertReferenceChecks(gc, op, [X, Y], dotproduct) |
| self.assertDeviceChecks(dc, op, [X, Y], [0]) |
| self.assertGradientChecks(gc, op, [X, Y], 0, [0]) |
| self.assertGradientChecks(gc, op, [X, Y], 1, [0]) |
| |
| @given(N=st.integers(min_value=2, max_value=10), |
| M=st.integers(min_value=2, max_value=10), **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_ensure_dense(self, N, M, gc, dc): |
| # in place |
| X = np.random.rand(N, M).astype(np.float32) - 0.5 |
| op = core.CreateOperator("EnsureDense", ["X"], "X") |
| self.assertReferenceChecks(gc, op, [X], lambda x: [x]) |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| # or not |
| X = np.random.rand(N, M).astype(np.float32) - 0.5 |
| op = core.CreateOperator("EnsureDense", ["X"], "out") |
| self.assertReferenceChecks(gc, op, [X], lambda x: [x]) |
| self.assertDeviceChecks(dc, op, [X], [0]) |
| |
| @given(N=st.integers(min_value=10, max_value=100), |
| M=st.integers(min_value=2, max_value=10), |
| num_buckets=st.integers(min_value=1, max_value=5), |
| **hu.gcs_cpu_only) |
| @settings(deadline=10000) |
| def test_accumulate_histogram_op(self, N, M, num_buckets, gc, dc): |
| X = np.random.rand(N, M).astype(np.float32) |
| lower_bound, upper_bound = 0.1, 0.9 |
| op = core.CreateOperator("AccumulateHistogram", ["X"], |
| ['cur_hist', 'acc_hist'], |
| lower_bound=lower_bound, |
| upper_bound=upper_bound, |
| num_buckets=num_buckets) |
| |
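| # Bucket 0 collects values below lower_bound, buckets 1..num_buckets cover |
| # [lower_bound, upper_bound), and bucket num_buckets + 1 collects the rest. |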
| def histogram(X): |
| hist = np.zeros((num_buckets + 2, ), dtype=np.int32) |
| segment = (upper_bound - lower_bound) / num_buckets |
| Y = np.zeros((N, M), dtype=np.int32) |
| Y[X < lower_bound] = 0 |
| Y[X >= upper_bound] = num_buckets + 1 |
| Y[(X >= lower_bound) & (X < upper_bound)] = \ |
| ((X[(X >= lower_bound) & (X < upper_bound)] - lower_bound) / |
| segment + 1).astype(np.int32) |
| |
| for i in range(Y.shape[0]): |
| for j in range(Y.shape[1]): |
| hist[Y[i][j]] += 1 |
| cur_hist, acc_hist = hist, hist |
| |
| return [cur_hist, acc_hist] |
| |
| self.assertDeviceChecks(dc, op, [X], [0, 1]) |
| self.assertReferenceChecks(gc, op, [X], histogram) |
| |
| @settings(max_examples=1, deadline=None) |
| @given( |
| queue_capacity=st.integers(2, 2), |
| time_sleep=st.integers(5, 10), |
| num_blobs_to_enqueue=st.integers(1, 1), |
| num_blobs_to_dequeue=st.integers(2, 2), |
| ) |
| def test_safe_dequeue_blob__raises_exception_when_hang( |
| self, |
| queue_capacity, |
| time_sleep, |
| num_blobs_to_enqueue, |
| num_blobs_to_dequeue, |
| ): |
| r""" |
| Tests that SafeDequeueBlobsOp is cancellable. |
| |
| Create a queue whose number of blobs per record is smaller than the |
| number requested by SafeDequeueBlobs, so running the net hangs. |
| |
| Then cancel the net from a separate thread after a short sleep and |
| verify that an exception is raised. |
| """ |
| |
| def _net_instance_cancel(net_instance): |
| time.sleep(time_sleep) |
| net_instance.cancel() |
| |
| init_net = core.Net("init_net") |
| init_net.Proto().type = "async_scheduling" |
| |
| queue = init_net.CreateBlobsQueue( |
| [], |
| "queue_name", |
| capacity=queue_capacity, |
| num_blobs=num_blobs_to_enqueue, |
| ) |
| |
| ws = workspace.Workspace() |
| ws.create_net(init_net).run() |
| |
| net = core.Net("net") |
| net.Proto().type = "async_scheduling" |
| |
| blobs = net.SafeDequeueBlobs([queue], num_blobs_to_dequeue) |
| |
| net_instance = ws.create_net(net) |
| |
| t = threading.Thread(target=_net_instance_cancel, args=[net_instance]) |
| t.start() |
| |
| with self.assertRaises(Exception): |
| net_instance.run() |
| t.join() |
| |
| |
| if __name__ == "__main__": |
| unittest.main() |