Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
Latest commit: e773f28ee3
87 Commits
e773f28ee3
Reland "Add forward mode AD to out-place foreach functions (#102409) (#106043)
forward-mode AD of out-of-place foreach functions, finally. rel: - #102409 - #105504 - #58833 - #100695 --- # Generated Foreach ```c++ ::std::vector<at::Tensor> _foreach_sinh(c10::DispatchKeySet ks, at::TensorList self) { auto self_ = unpack(self, "self", 0); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self ); std::vector<bool> _any_has_forward_grad_result(self.size()); for (const auto& i : c10::irange(self.size())) { _any_has_forward_grad_result[i] = isFwGradDefined(self[i]); } std::shared_ptr<ForeachSinhBackward0> grad_fn; if (_any_requires_grad) { grad_fn = std::shared_ptr<ForeachSinhBackward0>(new ForeachSinhBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self )); grad_fn->self_ = make_saved_variable_list(self); grad_fn->self_size_ = self.size(); } #ifndef NDEBUG std::vector<c10::optional<Storage>> self__storage_saved(self_.size()); for (const Tensor& tensor : self_) self__storage_saved.push_back( tensor.has_storage() ? c10::optional<Storage>(tensor.storage()) : c10::nullopt); std::vector<c10::intrusive_ptr<TensorImpl>> self__impl_saved(self_.size()); for (size_t i=0; i<self_.size(); i++) if (self_[i].defined()) self__impl_saved[i] = self_[i].getIntrusivePtr(); #endif auto _tmp = ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return at::redispatch::_foreach_sinh(ks & c10::after_autograd_keyset, self_); })(); auto result = std::move(_tmp); #ifndef NDEBUG for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved[i].value().is_alias_of(self_[i].storage())); } for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__impl_saved[i] && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved[i] == self_[i].getIntrusivePtr()); } #endif if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } std::vector<c10::optional<at::Tensor>> result_new_fw_grad_opts(self.size(), c10::nullopt); for (const auto& i : c10::irange(result_new_fw_grad_opts.size())) { if (_any_has_forward_grad_result[i]) { auto self_t_raw = toNonOptFwGrad(self[i]); auto self_tensor = toNonOptTensor(self[i]); auto self_t = (self_t_raw.defined() || !self_tensor.defined()) ? self_t_raw : at::_efficientzerotensor(self_tensor.sizes(), self_tensor.options()); auto self_p = toNonOptPrimal(self[i]); result_new_fw_grad_opts[i] = (self_t.conj() * self_p.cosh().conj()).conj(); } } for (const auto& i : c10::irange(result_new_fw_grad_opts.size())) { auto& result_new_fw_grad_opt = result_new_fw_grad_opts[i]; if (result_new_fw_grad_opt.has_value() && result_new_fw_grad_opt.value().defined() && result[i].defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. 
result[i]._set_fw_grad(result_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ false); } } return result; } ::std::vector<at::Tensor> _foreach_norm_Scalar(c10::DispatchKeySet ks, at::TensorList self, const at::Scalar & ord) { auto self_ = unpack(self, "self", 0); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self ); std::vector<bool> _any_has_forward_grad_result(self.size()); for (const auto& i : c10::irange(self.size())) { _any_has_forward_grad_result[i] = isFwGradDefined(self[i]); } std::shared_ptr<ForeachNormBackward0> grad_fn; if (_any_requires_grad) { grad_fn = std::shared_ptr<ForeachNormBackward0>(new ForeachNormBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self )); grad_fn->ord = ord; grad_fn->self_ = make_saved_variable_list(self); grad_fn->self_size_ = self.size(); } #ifndef NDEBUG std::vector<c10::optional<Storage>> self__storage_saved(self_.size()); for (const Tensor& tensor : self_) self__storage_saved.push_back( tensor.has_storage() ? c10::optional<Storage>(tensor.storage()) : c10::nullopt); std::vector<c10::intrusive_ptr<TensorImpl>> self__impl_saved(self_.size()); for (size_t i=0; i<self_.size(); i++) if (self_[i].defined()) self__impl_saved[i] = self_[i].getIntrusivePtr(); #endif auto _tmp = ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return at::redispatch::_foreach_norm(ks & c10::after_autograd_keyset, self_, ord); })(); auto result = std::move(_tmp); #ifndef NDEBUG for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved[i].value().is_alias_of(self_[i].storage())); } for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__impl_saved[i] && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved[i] == self_[i].getIntrusivePtr()); } #endif if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } std::vector<c10::optional<at::Tensor>> result_new_fw_grad_opts(self.size(), c10::nullopt); for (const auto& i : c10::irange(result_new_fw_grad_opts.size())) { if (_any_has_forward_grad_result[i]) { auto self_t_raw = toNonOptFwGrad(self[i]); auto self_tensor = toNonOptTensor(self[i]); auto self_t = (self_t_raw.defined() || !self_tensor.defined()) ? self_t_raw : at::_efficientzerotensor(self_tensor.sizes(), self_tensor.options()); auto self_p = toNonOptPrimal(self[i]); result_new_fw_grad_opts[i] = norm_jvp(self_p, self_t, ord, result[i]); } } for (const auto& i : c10::irange(result_new_fw_grad_opts.size())) { auto& result_new_fw_grad_opt = result_new_fw_grad_opts[i]; if (result_new_fw_grad_opt.has_value() && result_new_fw_grad_opt.value().defined() && result[i].defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. 
result[i]._set_fw_grad(result_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ false); } } if (grad_fn) { grad_fn->result = result; } return result; } ``` # Reference ```c++ at::Tensor sinh(c10::DispatchKeySet ks, const at::Tensor & self) { auto& self_ = unpack(self, "self", 0); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self ); [[maybe_unused]] auto _any_has_forward_grad_result = (isFwGradDefined(self)); std::shared_ptr<SinhBackward0> grad_fn; if (_any_requires_grad) { grad_fn = std::shared_ptr<SinhBackward0>(new SinhBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self )); grad_fn->self_ = SavedVariable(self, false); } #ifndef NDEBUG c10::optional<Storage> self__storage_saved = self_.has_storage() ? c10::optional<Storage>(self_.storage()) : c10::nullopt; c10::intrusive_ptr<TensorImpl> self__impl_saved; if (self_.defined()) self__impl_saved = self_.getIntrusivePtr(); #endif auto _tmp = ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return at::redispatch::sinh(ks & c10::after_autograd_keyset, self_); })(); auto result = std::move(_tmp); #ifndef NDEBUG if (self__storage_saved.has_value() && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved.value().is_alias_of(self_.storage())); if (self__impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved == self_.getIntrusivePtr()); if (result.has_storage() && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(result)) { TORCH_INTERNAL_ASSERT(result.storage().use_count() == 1, "function: sinh"); } if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(result)) TORCH_INTERNAL_ASSERT(result.use_count() <= 1, "function: sinh"); #endif if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } c10::optional<at::Tensor> result_new_fw_grad_opt = c10::nullopt; if (_any_has_forward_grad_result && (result.defined())) { auto self_t_raw = toNonOptFwGrad(self); auto self_tensor = toNonOptTensor(self); auto self_t = (self_t_raw.defined() || !self_tensor.defined()) ? self_t_raw : at::_efficientzerotensor(self_tensor.sizes(), self_tensor.options()); auto self_p = toNonOptPrimal(self); result_new_fw_grad_opt = (self_t.conj() * self_p.cosh().conj()).conj(); } if (result_new_fw_grad_opt.has_value() && result_new_fw_grad_opt.value().defined() && result.defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. result._set_fw_grad(result_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ false); } return result; } at::Tensor norm_Scalar(c10::DispatchKeySet ks, const at::Tensor & self, const at::Scalar & p) { auto& self_ = unpack(self, "self", 0); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self ); [[maybe_unused]] auto _any_has_forward_grad_result = (isFwGradDefined(self)); std::shared_ptr<NormBackward0> grad_fn; if (_any_requires_grad) { grad_fn = std::shared_ptr<NormBackward0>(new NormBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self )); grad_fn->p = p; grad_fn->self_ = SavedVariable(self, false); } #ifndef NDEBUG c10::optional<Storage> self__storage_saved = self_.has_storage() ? 
c10::optional<Storage>(self_.storage()) : c10::nullopt; c10::intrusive_ptr<TensorImpl> self__impl_saved; if (self_.defined()) self__impl_saved = self_.getIntrusivePtr(); #endif auto _tmp = ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return at::redispatch::norm(ks & c10::after_autograd_keyset, self_, p); })(); auto result = std::move(_tmp); #ifndef NDEBUG if (self__storage_saved.has_value() && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved.value().is_alias_of(self_.storage())); if (self__impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved == self_.getIntrusivePtr()); if (result.has_storage() && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(result)) { TORCH_INTERNAL_ASSERT(result.storage().use_count() == 1, "function: norm_Scalar"); } if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(result)) TORCH_INTERNAL_ASSERT(result.use_count() <= 1, "function: norm_Scalar"); #endif if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } throw_error_for_complex_autograd(result, "norm"); c10::optional<at::Tensor> result_new_fw_grad_opt = c10::nullopt; if (_any_has_forward_grad_result && (result.defined())) { auto self_t_raw = toNonOptFwGrad(self); auto self_tensor = toNonOptTensor(self); auto self_t = (self_t_raw.defined() || !self_tensor.defined()) ? self_t_raw : at::_efficientzerotensor(self_tensor.sizes(), self_tensor.options()); auto self_p = toNonOptPrimal(self); result_new_fw_grad_opt = norm_jvp(self_p, self_t, p, result); } if (result_new_fw_grad_opt.has_value() && result_new_fw_grad_opt.value().defined() && result.defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. result._set_fw_grad(result_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ false); } if (grad_fn) { grad_fn->result_ = SavedVariable(result, true); } return result; } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/106043 Approved by: https://github.com/soulitzer |
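To make the generated forward path above concrete, here is a minimal sketch (not taken from the PR) of driving forward-mode AD through an out-of-place foreach op and checking it against the per-tensor JVP of `sinh`. It assumes a build that includes this change.
```python
import torch
import torch.autograd.forward_ad as fwAD

primals = [torch.randn(3) for _ in range(4)]
tangents = [torch.randn(3) for _ in range(4)]

with fwAD.dual_level():
    duals = [fwAD.make_dual(p, t) for p, t in zip(primals, tangents)]
    outs = torch._foreach_sinh(duals)
    jvps = [fwAD.unpack_dual(o).tangent for o in outs]

# The JVP of sinh(x) is cosh(x) * tangent, matching the generated formula above.
for jvp, p, t in zip(jvps, primals, tangents):
    torch.testing.assert_close(jvp, torch.cosh(p) * t)
```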
72f2c87a5a
[foreach] Set SavedVariable.is_output to true for grad_fn->result_ (#105504)
fixes #105502 The scope of this pull request is out-of-place foreach functions that depend on their output tensorlist for backward such as `_foreach_exp`. An example of the generated code with this update is as follows: ```c++ variable_list ForeachExpBackward0::apply(variable_list&& grads) { std::lock_guard<std::mutex> lock(mutex_); TORCH_CHECK(!result_released_, ERR_BACKWARD_TWICE); IndexRangeGenerator gen; auto self_ix = gen.range(self_size_); variable_list grad_inputs(gen.size()); auto result = unpack_list(result_, shared_from_this()); if (task_should_compute_output({ self_ix })) { std::vector<Tensor> grad_result; grad_result.reserve(grads.size()); for (const auto & i : c10::irange(grads.size())) { if (grads[i].defined()) { grad_result.emplace_back(grads[i] * result[i].conj()); } else { grad_result.emplace_back(Tensor()); } } copy_range(grad_inputs, self_ix, grad_result); } return grad_inputs; } ::std::vector<at::Tensor> _foreach_exp(c10::DispatchKeySet ks, at::TensorList self) { auto self_ = unpack(self, "self", 0); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self ); std::shared_ptr<ForeachExpBackward0> grad_fn; if (_any_requires_grad) { grad_fn = std::shared_ptr<ForeachExpBackward0>(new ForeachExpBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self )); grad_fn->self_size_ = self.size(); } #ifndef NDEBUG std::vector<c10::optional<Storage>> self__storage_saved(self_.size()); for (const Tensor& tensor : self_) self__storage_saved.push_back( tensor.has_storage() ? c10::optional<Storage>(tensor.storage()) : c10::nullopt); std::vector<c10::intrusive_ptr<TensorImpl>> self__impl_saved(self_.size()); for (size_t i=0; i<self_.size(); i++) if (self_[i].defined()) self__impl_saved[i] = self_[i].getIntrusivePtr(); #endif auto _tmp = ([&]() { if ((isFwGradDefinedTensorList(self))) { static c10::OperatorName full_name("aten::_foreach_exp", ""); static c10::optional<c10::OperatorHandle> opt_op = c10::Dispatcher::singleton().findSchema(full_name); return impl::run_jit_decomposition_with_args_for_jvp<::std::vector<at::Tensor>>("_foreach_exp", *opt_op, ks, self); } else { at::AutoDispatchBelowADInplaceOrView guard; return at::redispatch::_foreach_exp(ks & c10::after_autograd_keyset, self_); } })(); auto result = std::move(_tmp); #ifndef NDEBUG for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved[i].value().is_alias_of(self_[i].storage())); } for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__impl_saved[i] && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved[i] == self_[i].getIntrusivePtr()); } #endif if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } if (grad_fn) { grad_fn->result_ = make_saved_variable_list(result, true); } return result; } ``` A bit of context: - https://github.com/pytorch/pytorch/pull/105368#issuecomment-1640912479 Pull Request resolved: https://github.com/pytorch/pytorch/pull/105504 Approved by: https://github.com/soulitzer |
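As a quick illustration (not from the PR) of why `result_` matters here: the backward of `_foreach_exp` reads its own output, so gradients should line up with the per-tensor `torch.exp` reference. A minimal sketch, assuming a build with this change:
```python
import torch

xs = [torch.randn(3, requires_grad=True) for _ in range(4)]
outs = torch._foreach_exp(xs)
sum(o.sum() for o in outs).backward()

# d/dx exp(x) = exp(x); the saved output is exactly what the backward multiplies by.
for x in xs:
    torch.testing.assert_close(x.grad, torch.exp(x.detach()))
```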
803d42e457
add lerp cpu support for half (#105607)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/105607
Approved by: https://github.com/albanD
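A small illustrative check of what this enables (assuming a build with the change): `torch.lerp` on CPU half tensors, compared against a float32 reference.
```python
import torch

start = torch.randn(8, dtype=torch.half)
end = torch.randn(8, dtype=torch.half)
out = torch.lerp(start, end, 0.25)   # runs on CPU in half after this change

# Compare against a float32 reference with loose tolerances for half precision.
ref = torch.lerp(start.float(), end.float(), 0.25)
torch.testing.assert_close(out.float(), ref, rtol=1e-2, atol=1e-2)
```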
73e1455327
[BE] Enable ruff's UP rules and autoformat test/ (#105434)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/105434
Approved by: https://github.com/albanD
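For context, a purely hypothetical before/after snippet of the kind of rewrite pyupgrade-style UP rules apply during such an autoformat pass (e.g. builtin generics via UP006, f-strings via UP032):
```python
from __future__ import annotations
from typing import Dict  # "before" style: typing aliases and .format() calls

def before(x: int) -> Dict[str, int]:
    return {"value: {}".format(x): x}

def after(x: int) -> dict[str, int]:  # "after" style: builtin generics, f-strings
    return {f"value: {x}": x}

assert before(1) == after(1)
```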
8958f041be
Revert "Add forward mode AD to out-place foreach functions (#102409)"
This reverts commit e2ec0ba404.
e2ec0ba404
Add forward mode AD to out-place foreach functions (#102409)
The major difference from in-place support is that some out-place functions have their derivatives spelled out in derivatives.yaml, which requires some changes in `load_derivatives.py` and some handlings in various places due to the others whose derivatives are generated by `torchgen`. rel: - #58833 - #100695 --- # Generated Foreach ```c++ ::std::vector<at::Tensor> _foreach_sinh(c10::DispatchKeySet ks, at::TensorList self) { auto self_ = unpack(self, "self", 0); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self ); std::vector<bool> _any_has_forward_grad_result(self.size()); for (const auto& i : c10::irange(self.size())) { _any_has_forward_grad_result[i] = isFwGradDefined(self[i]); } std::shared_ptr<ForeachSinhBackward0> grad_fn; if (_any_requires_grad) { grad_fn = std::shared_ptr<ForeachSinhBackward0>(new ForeachSinhBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self )); grad_fn->self_ = make_saved_variable_list(self); grad_fn->self_size_ = self.size(); } #ifndef NDEBUG std::vector<c10::optional<Storage>> self__storage_saved(self_.size()); for (const Tensor& tensor : self_) self__storage_saved.push_back( tensor.has_storage() ? c10::optional<Storage>(tensor.storage()) : c10::nullopt); std::vector<c10::intrusive_ptr<TensorImpl>> self__impl_saved(self_.size()); for (size_t i=0; i<self_.size(); i++) if (self_[i].defined()) self__impl_saved[i] = self_[i].getIntrusivePtr(); #endif auto _tmp = ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return at::redispatch::_foreach_sinh(ks & c10::after_autograd_keyset, self_); })(); auto result = std::move(_tmp); #ifndef NDEBUG for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved[i].value().is_alias_of(self_[i].storage())); } for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__impl_saved[i] && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved[i] == self_[i].getIntrusivePtr()); } #endif if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } std::vector<c10::optional<at::Tensor>> result_new_fw_grad_opts(self.size(), c10::nullopt); for (const auto& i : c10::irange(result_new_fw_grad_opts.size())) { if (_any_has_forward_grad_result[i]) { auto self_t_raw = toNonOptFwGrad(self[i]); auto self_tensor = toNonOptTensor(self[i]); auto self_t = (self_t_raw.defined() || !self_tensor.defined()) ? self_t_raw : at::_efficientzerotensor(self_tensor.sizes(), self_tensor.options()); auto self_p = toNonOptPrimal(self[i]); result_new_fw_grad_opts[i] = (self_t.conj() * self_p.cosh().conj()).conj(); } } for (const auto& i : c10::irange(result_new_fw_grad_opts.size())) { auto& result_new_fw_grad_opt = result_new_fw_grad_opts[i]; if (result_new_fw_grad_opt.has_value() && result_new_fw_grad_opt.value().defined() && result[i].defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. 
result[i]._set_fw_grad(result_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ false); } } return result; } ::std::vector<at::Tensor> _foreach_norm_Scalar(c10::DispatchKeySet ks, at::TensorList self, const at::Scalar & ord) { auto self_ = unpack(self, "self", 0); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self ); std::vector<bool> _any_has_forward_grad_result(self.size()); for (const auto& i : c10::irange(self.size())) { _any_has_forward_grad_result[i] = isFwGradDefined(self[i]); } std::shared_ptr<ForeachNormBackward0> grad_fn; if (_any_requires_grad) { grad_fn = std::shared_ptr<ForeachNormBackward0>(new ForeachNormBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self )); grad_fn->ord = ord; grad_fn->self_ = make_saved_variable_list(self); grad_fn->self_size_ = self.size(); } #ifndef NDEBUG std::vector<c10::optional<Storage>> self__storage_saved(self_.size()); for (const Tensor& tensor : self_) self__storage_saved.push_back( tensor.has_storage() ? c10::optional<Storage>(tensor.storage()) : c10::nullopt); std::vector<c10::intrusive_ptr<TensorImpl>> self__impl_saved(self_.size()); for (size_t i=0; i<self_.size(); i++) if (self_[i].defined()) self__impl_saved[i] = self_[i].getIntrusivePtr(); #endif auto _tmp = ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return at::redispatch::_foreach_norm(ks & c10::after_autograd_keyset, self_, ord); })(); auto result = std::move(_tmp); #ifndef NDEBUG for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved[i].value().is_alias_of(self_[i].storage())); } for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__impl_saved[i] && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved[i] == self_[i].getIntrusivePtr()); } #endif if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } std::vector<c10::optional<at::Tensor>> result_new_fw_grad_opts(self.size(), c10::nullopt); for (const auto& i : c10::irange(result_new_fw_grad_opts.size())) { if (_any_has_forward_grad_result[i]) { auto self_t_raw = toNonOptFwGrad(self[i]); auto self_tensor = toNonOptTensor(self[i]); auto self_t = (self_t_raw.defined() || !self_tensor.defined()) ? self_t_raw : at::_efficientzerotensor(self_tensor.sizes(), self_tensor.options()); auto self_p = toNonOptPrimal(self[i]); result_new_fw_grad_opts[i] = norm_jvp(self_p, self_t, ord, result[i]); } } for (const auto& i : c10::irange(result_new_fw_grad_opts.size())) { auto& result_new_fw_grad_opt = result_new_fw_grad_opts[i]; if (result_new_fw_grad_opt.has_value() && result_new_fw_grad_opt.value().defined() && result[i].defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. 
result[i]._set_fw_grad(result_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ false); } } if (grad_fn) { grad_fn->result = result; } return result; } ``` # Reference ```c++ at::Tensor sinh(c10::DispatchKeySet ks, const at::Tensor & self) { auto& self_ = unpack(self, "self", 0); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self ); [[maybe_unused]] auto _any_has_forward_grad_result = (isFwGradDefined(self)); std::shared_ptr<SinhBackward0> grad_fn; if (_any_requires_grad) { grad_fn = std::shared_ptr<SinhBackward0>(new SinhBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self )); grad_fn->self_ = SavedVariable(self, false); } #ifndef NDEBUG c10::optional<Storage> self__storage_saved = self_.has_storage() ? c10::optional<Storage>(self_.storage()) : c10::nullopt; c10::intrusive_ptr<TensorImpl> self__impl_saved; if (self_.defined()) self__impl_saved = self_.getIntrusivePtr(); #endif auto _tmp = ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return at::redispatch::sinh(ks & c10::after_autograd_keyset, self_); })(); auto result = std::move(_tmp); #ifndef NDEBUG if (self__storage_saved.has_value() && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved.value().is_alias_of(self_.storage())); if (self__impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved == self_.getIntrusivePtr()); if (result.has_storage() && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(result)) { TORCH_INTERNAL_ASSERT(result.storage().use_count() == 1, "function: sinh"); } if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(result)) TORCH_INTERNAL_ASSERT(result.use_count() <= 1, "function: sinh"); #endif if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } c10::optional<at::Tensor> result_new_fw_grad_opt = c10::nullopt; if (_any_has_forward_grad_result && (result.defined())) { auto self_t_raw = toNonOptFwGrad(self); auto self_tensor = toNonOptTensor(self); auto self_t = (self_t_raw.defined() || !self_tensor.defined()) ? self_t_raw : at::_efficientzerotensor(self_tensor.sizes(), self_tensor.options()); auto self_p = toNonOptPrimal(self); result_new_fw_grad_opt = (self_t.conj() * self_p.cosh().conj()).conj(); } if (result_new_fw_grad_opt.has_value() && result_new_fw_grad_opt.value().defined() && result.defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. result._set_fw_grad(result_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ false); } return result; } at::Tensor norm_Scalar(c10::DispatchKeySet ks, const at::Tensor & self, const at::Scalar & p) { auto& self_ = unpack(self, "self", 0); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self ); [[maybe_unused]] auto _any_has_forward_grad_result = (isFwGradDefined(self)); std::shared_ptr<NormBackward0> grad_fn; if (_any_requires_grad) { grad_fn = std::shared_ptr<NormBackward0>(new NormBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self )); grad_fn->p = p; grad_fn->self_ = SavedVariable(self, false); } #ifndef NDEBUG c10::optional<Storage> self__storage_saved = self_.has_storage() ? 
c10::optional<Storage>(self_.storage()) : c10::nullopt; c10::intrusive_ptr<TensorImpl> self__impl_saved; if (self_.defined()) self__impl_saved = self_.getIntrusivePtr(); #endif auto _tmp = ([&]() { at::AutoDispatchBelowADInplaceOrView guard; return at::redispatch::norm(ks & c10::after_autograd_keyset, self_, p); })(); auto result = std::move(_tmp); #ifndef NDEBUG if (self__storage_saved.has_value() && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved.value().is_alias_of(self_.storage())); if (self__impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved == self_.getIntrusivePtr()); if (result.has_storage() && !at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(result)) { TORCH_INTERNAL_ASSERT(result.storage().use_count() == 1, "function: norm_Scalar"); } if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(result)) TORCH_INTERNAL_ASSERT(result.use_count() <= 1, "function: norm_Scalar"); #endif if (grad_fn) { set_history(flatten_tensor_args( result ), grad_fn); } throw_error_for_complex_autograd(result, "norm"); c10::optional<at::Tensor> result_new_fw_grad_opt = c10::nullopt; if (_any_has_forward_grad_result && (result.defined())) { auto self_t_raw = toNonOptFwGrad(self); auto self_tensor = toNonOptTensor(self); auto self_t = (self_t_raw.defined() || !self_tensor.defined()) ? self_t_raw : at::_efficientzerotensor(self_tensor.sizes(), self_tensor.options()); auto self_p = toNonOptPrimal(self); result_new_fw_grad_opt = norm_jvp(self_p, self_t, p, result); } if (result_new_fw_grad_opt.has_value() && result_new_fw_grad_opt.value().defined() && result.defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. result._set_fw_grad(result_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ false); } if (grad_fn) { grad_fn->result_ = SavedVariable(result, true); } return result; } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/102409 Approved by: https://github.com/soulitzer |
6d2887cc06
Reland "Move tensor grouping to ATen" (#103912)
This is a reland of https://github.com/pytorch/pytorch/pull/100007 with a build fix for Windows debug builds.
`at::native::ParamsHash` only works on structs with standard layout, but `std::string` isn't one in Visual C++ debug builds, which one can easily verified by running something like:
```cpp
#define _DEBUG
#include <type_traits>
#include <string>
static_assert(std::is_standard_layout_v<std::string>, "Oh noes");
```
If above conditon is not met, instead of printing a static_assert output, VC++ raises a very cryptic compilation errors, see https://github.com/pytorch/pytorch/pull/100007#discussion_r1227116292 for more detail.
Also, using `std::hash` for string should result in a faster hash function.
(cherry picked from commit 74b7a6c75e)
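For readers unfamiliar with what "tensor grouping" refers to, here is a purely illustrative Python sketch of the idea (not the ATen API added by this PR): foreach kernels need buckets of tensors that share a device and dtype.
```python
import torch
from collections import defaultdict

def group_by_device_and_dtype(tensors):
    # Bucket tensors (keeping their original indices) by (device, dtype).
    groups = defaultdict(list)
    for i, t in enumerate(tensors):
        groups[(t.device, t.dtype)].append((i, t))
    return dict(groups)

tensors = [torch.ones(2), torch.ones(2, dtype=torch.half), torch.zeros(3)]
for (device, dtype), bucket in group_by_device_and_dtype(tensors).items():
    print(device, dtype, [i for i, _ in bucket])
```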
0cb5bc3b04
Revert "Move tensor grouping to ATen (#100007)"
This reverts commit 74b7a6c75e.
74b7a6c75e
Move tensor grouping to ATen (#100007)
rel: #94344
Pull Request resolved: https://github.com/pytorch/pytorch/pull/100007
Approved by: https://github.com/janeyx99
0bb2b01541
Add forward mode AD to in-place foreach functions (#100695)
Awkwardly implement fwd AD by - adding a few `CodeTemplate`s - allowing for the cases where a variable is initialized with i-th element of TensorList <!-- ### TODOs: - [x] ~~remove the first `_any_has_forward_grad_self`~~ make it a vector of bool - [ ] clean up mapping of names from reference impl to foreach impl - [x] add tests --> ### Rel: - #58833 - #96405 --- `_foreach_addcmul_.ScalarList` from `VariableType` ```c++ void _foreach_addcmul__ScalarList(c10::DispatchKeySet ks, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars) { auto self_ = unpack(self, "self", 0); auto tensor1_ = unpack(tensor1, "tensor1", 1); auto tensor2_ = unpack(tensor2, "tensor2", 2); [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self, tensor1, tensor2 ); std::vector<bool> _any_has_forward_grad_self(self.size()); for (const auto& i : c10::irange(self.size())) { _any_has_forward_grad_self[i] = isFwGradDefined(self[i]) || isFwGradDefined(tensor1[i]) || isFwGradDefined(tensor2[i]); } std::vector<c10::optional<at::Tensor>> original_selfs(self.size()); std::vector<std::shared_ptr<AddcmulBackward0>> grad_fns; if (_any_requires_grad) { for (const auto& i : c10::irange( self.size() )) { const auto ith_requires_grad = compute_requires_grad(self[i], tensor1[i], tensor2[i]); check_inplace(self[i], ith_requires_grad); grad_fns.push_back([&]() -> std::shared_ptr<AddcmulBackward0> { if (!ith_requires_grad) { return nullptr; } else { auto grad_fn = std::shared_ptr<AddcmulBackward0>(new AddcmulBackward0(), deleteNode); grad_fn->set_next_edges(collect_next_edges( self[i], tensor1[i], tensor2[i] )); return grad_fn; } }()); } if (!grad_fns.empty()) { for (const auto& i : c10::irange(grad_fns.size())) { auto grad_fn = grad_fns[i]; if (grad_fn != nullptr) { grad_fn->self_scalar_type = self[i].scalar_type(); grad_fn->tensor1_scalar_type = tensor1[i].scalar_type(); if (grad_fn->should_compute_output(1)) { grad_fn->tensor2_ = SavedVariable(tensor2[i], false); } grad_fn->value = scalars[i]; if (grad_fn->should_compute_output(2)) { grad_fn->tensor1_ = SavedVariable(tensor1[i], false); } grad_fn->tensor2_scalar_type = tensor2[i].scalar_type(); } } } } #ifndef NDEBUG std::vector<c10::optional<Storage>> self__storage_saved(self_.size()); for (const Tensor& tensor : self_) self__storage_saved.push_back( tensor.has_storage() ? c10::optional<Storage>(tensor.storage()) : c10::nullopt); std::vector<c10::intrusive_ptr<TensorImpl>> self__impl_saved(self_.size()); for (size_t i=0; i<self_.size(); i++) if (self_[i].defined()) self__impl_saved[i] = self_[i].getIntrusivePtr(); std::vector<c10::optional<Storage>> tensor1__storage_saved(tensor1_.size()); for (const Tensor& tensor : tensor1_) tensor1__storage_saved.push_back( tensor.has_storage() ? c10::optional<Storage>(tensor.storage()) : c10::nullopt); std::vector<c10::intrusive_ptr<TensorImpl>> tensor1__impl_saved(tensor1_.size()); for (size_t i=0; i<tensor1_.size(); i++) if (tensor1_[i].defined()) tensor1__impl_saved[i] = tensor1_[i].getIntrusivePtr(); std::vector<c10::optional<Storage>> tensor2__storage_saved(tensor2_.size()); for (const Tensor& tensor : tensor2_) tensor2__storage_saved.push_back( tensor.has_storage() ? 
c10::optional<Storage>(tensor.storage()) : c10::nullopt); std::vector<c10::intrusive_ptr<TensorImpl>> tensor2__impl_saved(tensor2_.size()); for (size_t i=0; i<tensor2_.size(); i++) if (tensor2_[i].defined()) tensor2__impl_saved[i] = tensor2_[i].getIntrusivePtr(); #endif { at::AutoDispatchBelowAutograd guard; at::redispatch::_foreach_addcmul_(ks & c10::after_autograd_keyset, self_, tensor1_, tensor2_, scalars); } #ifndef NDEBUG for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__storage_saved[i].value().is_alias_of(self_[i].storage())); } for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (self__impl_saved[i] && !at::impl::tensorlist_has_dispatch(self_)) TORCH_INTERNAL_ASSERT(self__impl_saved[i] == self_[i].getIntrusivePtr()); } for (size_t i=0; i<tensor1_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (tensor1__storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(tensor1_)) TORCH_INTERNAL_ASSERT(tensor1__storage_saved[i].value().is_alias_of(tensor1_[i].storage())); } for (size_t i=0; i<tensor1_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (tensor1__impl_saved[i] && !at::impl::tensorlist_has_dispatch(tensor1_)) TORCH_INTERNAL_ASSERT(tensor1__impl_saved[i] == tensor1_[i].getIntrusivePtr()); } for (size_t i=0; i<tensor2_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (tensor2__storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(tensor2_)) TORCH_INTERNAL_ASSERT(tensor2__storage_saved[i].value().is_alias_of(tensor2_[i].storage())); } for (size_t i=0; i<tensor2_.size() && !at::impl::dispatch_mode_enabled(); i++) { if (tensor2__impl_saved[i] && !at::impl::tensorlist_has_dispatch(tensor2_)) TORCH_INTERNAL_ASSERT(tensor2__impl_saved[i] == tensor2_[i].getIntrusivePtr()); } #endif if (!grad_fns.empty()) { auto differentiable_outputs = flatten_tensor_args( self ); TORCH_INTERNAL_ASSERT(differentiable_outputs.size() == grad_fns.size()); for (const auto& i : c10::irange(grad_fns.size())) { auto grad_fn = grad_fns[i]; if (grad_fn != nullptr) { rebase_history(differentiable_outputs[i], grad_fns[i]); } } } std::vector<c10::optional<at::Tensor>> self_new_fw_grad_opts(self.size(), c10::nullopt); for (const auto& i : c10::irange(self_new_fw_grad_opts.size())) { if (_any_has_forward_grad_self[i]) { auto self_t_raw = toNonOptFwGrad(self[i]); auto self_tensor = toNonOptTensor(self[i]); auto self_t = (self_t_raw.defined() || !self_tensor.defined()) ? self_t_raw : at::zeros(self_tensor.sizes(), self_tensor.options()); auto tensor1_t_raw = toNonOptFwGrad(tensor1[i]); auto tensor1_tensor = toNonOptTensor(tensor1[i]); auto tensor1_t = (tensor1_t_raw.defined() || !tensor1_tensor.defined()) ? tensor1_t_raw : at::_efficientzerotensor(tensor1_tensor.sizes(), tensor1_tensor.options()); auto tensor1_p = toNonOptPrimal(tensor1[i]); auto tensor2_t_raw = toNonOptFwGrad(tensor2[i]); auto tensor2_tensor = toNonOptTensor(tensor2[i]); auto tensor2_t = (tensor2_t_raw.defined() || !tensor2_tensor.defined()) ? tensor2_t_raw : at::_efficientzerotensor(tensor2_tensor.sizes(), tensor2_tensor.options()); auto tensor2_p = toNonOptPrimal(tensor2[i]); self_t = GradMode::is_enabled() ? self_t.clone() : self_t; self_new_fw_grad_opts[i] = self_t_raw.defined() ? 
self_t_raw.copy_(self_t + maybe_multiply(tensor1_t * tensor2_p, scalars[i]) + maybe_multiply(tensor2_t * tensor1_p, scalars[i])) : self_t + maybe_multiply(tensor1_t * tensor2_p, scalars[i]) + maybe_multiply(tensor2_t * tensor1_p, scalars[i]); } } for (const auto& i : c10::irange(self_new_fw_grad_opts.size())) { auto& self_new_fw_grad_opt = self_new_fw_grad_opts[i]; if (self_new_fw_grad_opt.has_value() && self_new_fw_grad_opt.value().defined() && self[i].defined()) { // The hardcoded 0 here will need to be updated once we support multiple levels. self[i]._set_fw_grad(self_new_fw_grad_opt.value(), /* level */ 0, /* is_inplace_op */ true); } } } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/100695 Approved by: https://github.com/soulitzer |
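A minimal sketch (not from the PR) of exercising the in-place path above: only `self` carries a tangent, so the JVP of `_foreach_addcmul_` reduces to that tangent. It assumes a build that includes this change.
```python
import torch
import torch.autograd.forward_ad as fwAD

n = 3
selfs = [torch.randn(4) for _ in range(n)]
t1s = [torch.randn(4) for _ in range(n)]
t2s = [torch.randn(4) for _ in range(n)]
tangents = [torch.randn(4) for _ in range(n)]
scalars = [0.5, 1.0, 2.0]

with fwAD.dual_level():
    duals = [fwAD.make_dual(s.clone(), t) for s, t in zip(selfs, tangents)]
    torch._foreach_addcmul_(duals, t1s, t2s, scalars)
    jvps = [fwAD.unpack_dual(d).tangent for d in duals]

# tensor1/tensor2 carry no tangents here, so each JVP is just the tangent of `self`.
for jvp, t in zip(jvps, tangents):
    torch.testing.assert_close(jvp, t)
```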
ba2bc7df8f
Enable backward on _foreach_zero_ (#101149)
Currently torchgen cannot find an appropriate `DifferentiabilityInfo` for `_foreach_zero_` because `gen_foreach_derivativeinfo` doesn't correctly make use of `functional_info_by_signature` and `differentiability_infos`, and `is_reference_for_foreach` is a bit too strict for `_foreach_zero_`.
Generated code in `VariableType`
```c++
void _foreach_zero_(c10::DispatchKeySet ks, at::TensorList self) {
auto self_ = unpack(self, "self", 0);
[[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self );
std::vector<c10::optional<at::Tensor>> original_selfs(self.size());
std::vector<std::shared_ptr<ZeroBackward0>> grad_fns;
if (_any_requires_grad) {
for (const auto& i : c10::irange( self.size() )) {
const auto ith_requires_grad = compute_requires_grad(self[i]);
check_inplace(self[i], ith_requires_grad);
grad_fns.push_back([&]() -> std::shared_ptr<ZeroBackward0> {
if (!ith_requires_grad) {
return nullptr;
} else {
auto grad_fn = std::shared_ptr<ZeroBackward0>(new ZeroBackward0(), deleteNode);
grad_fn->set_next_edges(collect_next_edges( self[i] ));
return grad_fn;
}
}());
}
}
#ifndef NDEBUG
std::vector<c10::optional<Storage>> self__storage_saved(self_.size());
for (const Tensor& tensor : self_)
self__storage_saved.push_back(
tensor.has_storage() ? c10::optional<Storage>(tensor.storage()) : c10::nullopt);
std::vector<c10::intrusive_ptr<TensorImpl>> self__impl_saved(self_.size());
for (size_t i=0; i<self_.size(); i++)
if (self_[i].defined()) self__impl_saved[i] = self_[i].getIntrusivePtr();
#endif
{
at::AutoDispatchBelowAutograd guard;
at::redispatch::_foreach_zero_(ks & c10::after_autograd_keyset, self_);
}
#ifndef NDEBUG
for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) {
if (self__storage_saved[i].has_value() && !at::impl::tensorlist_has_dispatch(self_))
TORCH_INTERNAL_ASSERT(self__storage_saved[i].value().is_alias_of(self_[i].storage()));
}
for (size_t i=0; i<self_.size() && !at::impl::dispatch_mode_enabled(); i++) {
if (self__impl_saved[i] && !at::impl::tensorlist_has_dispatch(self_))
TORCH_INTERNAL_ASSERT(self__impl_saved[i] == self_[i].getIntrusivePtr());
}
#endif
if (!grad_fns.empty()) {
auto differentiable_outputs = flatten_tensor_args( self );
TORCH_INTERNAL_ASSERT(differentiable_outputs.size() == grad_fns.size());
for (const auto& i : c10::irange(grad_fns.size())) {
auto grad_fn = grad_fns[i];
if (grad_fn != nullptr) {
rebase_history(differentiable_outputs[i], grad_fns[i]);
}
}
}
}
```
Rel:
- #58833
- #96405
Pull Request resolved: https://github.com/pytorch/pytorch/pull/101149
Approved by: https://github.com/soulitzer
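A minimal sketch (not from the PR) of the behavior this enables: backward through an in-place `_foreach_zero_` propagates zero gradients to the inputs.
```python
import torch

leaves = [torch.randn(3, requires_grad=True) for _ in range(4)]
work = [l * 2 for l in leaves]   # non-leaf copies that may be mutated in place
torch._foreach_zero_(work)
sum(w.sum() for w in work).backward()

# zero_'s derivative with respect to the overwritten input is zero everywhere.
for l in leaves:
    torch.testing.assert_close(l.grad, torch.zeros_like(l))
```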
6c934a89a7
Skip invalid grads in outplace foreachs' backward (#100256)
Fixes #100248
Pull Request resolved: https://github.com/pytorch/pytorch/pull/100256
Approved by: https://github.com/soulitzer, https://github.com/albanD
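A minimal sketch (not from the PR) of the failure mode being fixed: only one output of an out-of-place foreach op participates in the loss, so the remaining incoming grads are undefined and must be skipped by the backward.
```python
import torch

xs = [torch.randn(3, requires_grad=True) for _ in range(3)]
outs = torch._foreach_exp(xs)
outs[0].sum().backward()   # only the first output receives a gradient

assert xs[0].grad is not None
assert xs[1].grad is None and xs[2].grad is None
```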
674018903d
per-Tensor grad_fn for in-place foreach functions (#96405)
Generate a `grad_fn` for each (tuple of) `Tensor`(s) of the same index for `_foreach_foo_`; each `grad_fn` is `FooBackward`. The current status of foreach functions' backward support, for the record:
- out-place: implemented, but without optimized implementations like their forward path
- in-place: not implemented. I think this check
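A minimal sketch (not from the PR) of what per-tensor `grad_fn` means in practice: after an in-place foreach op, each tensor in the list points at its own backward node instead of one node shared by the whole list.
```python
import torch

# clone() makes non-leaf tensors whose history can be rebased by the in-place op.
xs = [torch.randn(3, requires_grad=True).clone() for _ in range(3)]
torch._foreach_mul_(xs, 2.0)
print([type(x.grad_fn).__name__ for x in xs])  # one MulBackward0 per tensor (under this PR)
```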
13ca08435c
[test_foreach] add cases of zero size tensors (#95028)
Supply zero-size tensors only if `multi_tensor_apply_kernel` would be called with high probability, i.e. the device is CUDA and the dtype is float32.
rel:
- https://github.com/pytorch/pytorch/pull/94655
- https://github.com/pytorch/pytorch/issues/94865
Pull Request resolved: https://github.com/pytorch/pytorch/pull/95028
Approved by: https://github.com/ngimel
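For illustration only (CPU here, so the fused fast path is not exercised): a zero-size tensor mixed into a foreach call is simply carried through.
```python
import torch

tensors = [torch.randn(0), torch.randn(5)]
outs = torch._foreach_add(tensors, 1.0)
print([tuple(o.shape) for o in outs])   # [(0,), (5,)]
```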
a48d518e45
test_foreach: remove skipMeta (#96599)
Happened to notice that the test doesn't seem to require the guard (at least in my local environment).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96599
Approved by: https://github.com/bdhirsh
f54233e273
[foreach] bump tensor's version and define backward via torchgen (as possible) (#93901)
## summary
- increment tensor versions in in-place foreach functions
- add logic to take care of `ArrayRef<Scalar>`
rel: https://github.com/pytorch/pytorch/issues/58833, https://github.com/pytorch/pytorch/pull/89591
Pull Request resolved: https://github.com/pytorch/pytorch/pull/93901
Approved by: https://github.com/albanD
3e9df622fb
[mta] implement _foreach_pow (#92303)
Mainly for the foreach path of `Adam` and `AdamW`.
rel: https://github.com/pytorch/pytorch/issues/58833
Pull Request resolved: https://github.com/pytorch/pytorch/pull/92303
Approved by: https://github.com/albanD
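An illustrative check (not from the PR) of the new op against the per-tensor reference:
```python
import torch

xs = [torch.rand(4) + 0.1 for _ in range(3)]
outs = torch._foreach_pow(xs, 2.0)
for o, x in zip(outs, xs):
    torch.testing.assert_close(o, x ** 2.0)
```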
67d9790985
[BE] Apply almost all remaining flake8-comprehension checks (#94676)
Applies the remaining flake8-comprehension fixes and checks. This change replaces all remaining unnecessary generator expressions with list/dict/set comprehensions, which are more succinct, performant, and better supported by our torch.jit compiler. It also removes useless generators such as `set(a for a in b)`, resolving them into just the set call.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/94676
Approved by: https://github.com/ezyang
30876229a7
[mta] Backward of unary foreach functions (#89591)
As per title, this PR defines the backward of those.
This doesn't implement forward-mode automatic differentiation, as the current codegen
32b2d8009a
check if multi_tensor_apply_kernel was called (#92077)
Replace all the hard-coded counts of CUDA kernel launches with a check that `multi_tensor_apply_kernel` was called, keeping the dependency on the kineto profiler.
Rel: https://github.com/pytorch/pytorch/pull/91844#issuecomment-1379844523
Pull Request resolved: https://github.com/pytorch/pytorch/pull/92077
Approved by: https://github.com/ngimel
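A sketch of the checking pattern described above (assumes a CUDA build; the kernel-name match mirrors what the commit describes rather than a specific test helper):
```python
import torch
from torch.profiler import profile, ProfilerActivity

if torch.cuda.is_available():
    ts = [torch.randn(16, device="cuda") for _ in range(10)]
    with profile(activities=[ProfilerActivity.CUDA]) as prof:
        torch._foreach_add_(ts, 1.0)
    # Look for the fused kernel by name instead of hard-coding launch counts.
    kernel_names = [evt.name for evt in prof.events()]
    assert any("multi_tensor_apply_kernel" in name for name in kernel_names)
```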
a76bc410df
Fix _foreach_norm on some tensor sizes (#91844)
This PR fixes 2 bugs with CUDA `_foreach_norm`:
1. Wrong norm when tensors are larger than kChunkSize = 65536
```
>>> torch._foreach_norm([torch.ones(60000, device="cuda") for _ in range(1)])
(tensor(244.9490, device='cuda:0', grad_fn=<NotImplemented>),)
>>> torch._foreach_norm([torch.ones(70000, device="cuda") for _ in range(1)])
(tensor(256., device='cuda:0', grad_fn=<NotImplemented>),)
>>> torch.ones(60000, device="cuda").norm()
tensor(244.9490, device='cuda:0', grad_fn=<LinalgVectorNormBackward0>)
>>> torch.ones(70000, device="cuda").norm()
tensor(264.5751, device='cuda:0', grad_fn=<LinalgVectorNormBackward0>)
```
2. Error when a tensor numel is smaller than the number of tensors
```
>>> torch._foreach_norm([torch.ones(9, device="cuda") for _ in range(10)])
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: select(): index 9 out of range for tensor of size [9] at dimension 0
```
This bug could have been caught by tests if `PYTORCH_TEST_WITH_SLOW` was 1, because it would have tested tensors of size 300*300=90000. It's not enabled by default; does someone know if it's ever enabled?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/91844
Approved by: https://github.com/ngimel
554a796aef
Implement torch._foreach_lerp (#87562)
As per title.
- [ ] ~~Q: Do we want `torch._foreach_lerp.ScalarList` as well?~~
- [ ] ~~we might want to have `ATen/native/cuda/lerp.cuh` and include it in `ATen/native/cuda/Lerp.cu` and `ATen/native/cuda/ForeachTernaryOp.cu`~~
Related:
- https://github.com/pytorch/pytorch/issues/58833
- https://github.com/pytorch/pytorch/issues/71683
Pull Request resolved: https://github.com/pytorch/pytorch/pull/87562
Approved by: https://github.com/ngimel
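An illustrative check (not from the PR) against the per-tensor `torch.lerp`, assuming the scalar-weight overload:
```python
import torch

starts = [torch.randn(4) for _ in range(3)]
ends = [torch.randn(4) for _ in range(3)]
outs = torch._foreach_lerp(starts, ends, 0.5)
for o, s, e in zip(outs, starts, ends):
    torch.testing.assert_close(o, torch.lerp(s, e, 0.5))
```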
9d20d6d5ec
Foreach clamp_min clamp_max (#91384)
Adds `_foreach_clamp_min` and `_foreach_clamp_max` as binary ops, with scalar, scalarlist and tensorlist support.
Timing example for `_foreach_clamp_min_` on a GTX3070Ti across a list of tensors with varying count and item size (times are in microseconds (us)):
CUDA:
```
[------------------ (tensors, scalar) -------------------]
| for loop | foreach
10 tensors of size 4 | 29.0 | 10.2
100 tensors of size 4 | 234.4 | 18.3
1000 tensors of size 4 | 2194.1 | 113.5
10000 tensors of size 4 | 21745.6 | 1144.5
10 tensors of size 16 | 29.5 | 12.0
100 tensors of size 16 | 256.9 | 19.9
1000 tensors of size 16 | 2499.7 | 123.6
10000 tensors of size 16 | 25022.2 | 1295.6
10 tensors of size 256 | 32.8 | 11.2
100 tensors of size 256 | 258.8 | 19.7
1000 tensors of size 256 | 2509.2 | 123.7
10000 tensors of size 256 | 25016.2 | 1295.4
10 tensors of size 65536 | 32.9 | 18.7
100 tensors of size 65536 | 327.1 | 150.3
1000 tensors of size 65536 | 3051.3 | 1388.0
10000 tensors of size 65536 | 30476.9 | 14021.5
[------------------ (tensors, tensors) ------------------]
| for loop | foreach
10 tensors of size 4 | 26.8 | 17.3
100 tensors of size 4 | 206.8 | 90.5
1000 tensors of size 4 | 1993.0 | 828.9
10000 tensors of size 4 | 19851.0 | 9063.3
10 tensors of size 16 | 34.7 | 20.0
100 tensors of size 16 | 232.2 | 102.1
1000 tensors of size 16 | 2220.9 | 977.3
10000 tensors of size 16 | 22644.5 | 10361.4
10 tensors of size 256 | 30.5 | 19.7
100 tensors of size 256 | 231.6 | 102.4
1000 tensors of size 256 | 2251.9 | 978.7
10000 tensors of size 256 | 22680.3 | 10405.8
10 tensors of size 65536 | 30.6 | 34.4
100 tensors of size 65536 | 315.1 | 223.6
1000 tensors of size 65536 | 3252.1 | 2114.4
10000 tensors of size 65536 | 30578.0 | 22826.3
```
CPU:
```
[------------------- (tensors, scalar) -------------------]
| for loop | foreach
10 tensors of size 4 | 13.0 | 9.6
100 tensors of size 4 | 62.4 | 31.6
1000 tensors of size 4 | 562.2 | 245.6
10000 tensors of size 4 | 5552.2 | 2517.7
10 tensors of size 16 | 14.9 | 11.3
100 tensors of size 16 | 74.1 | 36.9
1000 tensors of size 16 | 663.7 | 285.5
10000 tensors of size 16 | 6765.2 | 2947.5
10 tensors of size 256 | 15.2 | 11.8
100 tensors of size 256 | 76.0 | 37.7
1000 tensors of size 256 | 728.8 | 323.9
10000 tensors of size 256 | 7274.4 | 3800.3
10 tensors of size 65536 | 105.6 | 124.5
100 tensors of size 65536 | 982.8 | 939.7
1000 tensors of size 65536 | 14993.1 | 14579.2
10000 tensors of size 65536 | 163091.0 | 151555.8
[------------------- (tensors, tensors) ------------------]
| for loop | foreach
10 tensors of size 4 | 11.8 | 10.5
100 tensors of size 4 | 53.1 | 38.2
1000 tensors of size 4 | 465.1 | 316.1
10000 tensors of size 4 | 4616.9 | 3625.9
10 tensors of size 16 | 13.5 | 12.3
100 tensors of size 16 | 63.0 | 46.5
1000 tensors of size 16 | 560.1 | 359.9
10000 tensors of size 16 | 5586.8 | 3765.9
10 tensors of size 256 | 15.2 | 13.7
100 tensors of size 256 | 64.4 | 48.3
1000 tensors of size 256 | 653.7 | 410.0
10000 tensors of size 256 | 5916.6 | 3901.3
10 tensors of size 65536 | 109.1 | 106.8
100 tensors of size 65536 | 1128.9 | 1105.0
1000 tensors of size 65536 | 16245.0 | 15950.8
10000 tensors of size 65536 | 171111.3 | 163540.2
```
Example use:
```
tensors = [torch.randn(16, device='cuda') for _ in range(10)]
out = torch._foreach_clamp_min(tensors, 0.1)
out = torch._foreach_clamp_min(tensors, [0.1] * len(tensors))
out = torch._foreach_clamp_min(tensors, tensors)
torch._foreach_clamp_min_(tensors, 0.1)
torch._foreach_clamp_min_(tensors, [0.1] * len(tensors))
torch._foreach_clamp_min_(tensors, tensors)
```
Does not support complex types.
Changes the existing `foreach_minimum/maximum` to use this new implementation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/91384
Approved by: https://github.com/ngimel
6fd416650a
Add _foreach_addc(div/mul)(_).Tensor (#88157)
Support passing value scalars as a flat 1D Tensor. Currently we can only pass either an individual scalar or a ScalarList.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88157
Approved by: https://github.com/ngimel, https://github.com/albanD
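An illustrative sketch (not from the PR) of passing the per-tensor values as a flat 1-D tensor, checked against per-tensor `torch.addcdiv`:
```python
import torch

n = 3
selfs = [torch.randn(4) for _ in range(n)]
t1s = [torch.randn(4) for _ in range(n)]
t2s = [torch.randn(4).abs() + 1.0 for _ in range(n)]   # keep denominators away from 0
values = torch.tensor([0.5, 1.0, 2.0])                  # the new flat-Tensor form

outs = torch._foreach_addcdiv(selfs, t1s, t2s, values)
for o, s, a, b, v in zip(outs, selfs, t1s, t2s, values.tolist()):
    torch.testing.assert_close(o, torch.addcdiv(s, a, b, value=v))
```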
f701cb04fb
Test Dynamo CI w Fake Tensors (#84282)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/84282
Approved by: https://github.com/anijain2305
3139722679
[foreach][mta] Inplace maximum and minimum (#82523)
### Description
Implement `torch._foreach_maximum_` and `torch._foreach_minimum_`, mainly for `_multi_tensor_adam` and `_multi_tensor_adamw` with `amsgrad=True` to correctly update their `max_exp_avg_sqs`.
### Issue
- https://github.com/pytorch/pytorch/issues/78807
- https://github.com/pytorch/pytorch/pull/81894
- https://github.com/pytorch/pytorch/pull/81348
- https://github.com/pytorch/pytorch/pull/81705
- https://github.com/pytorch/pytorch/issues/58833
- https://github.com/pytorch/pytorch/issues/68041
### Testing
Updated `test_foreach.py::TestForeach::_minmax_test` to compare the outputs of `_foreach_maximum_` (and `_foreach_minimum_`) against those of `[torch.maximum(a, b) for a, b in zip(tensors1, tensors2)]`.
cc @ngimel @albanD @mikaylagawarecki
Pull Request resolved: https://github.com/pytorch/pytorch/pull/82523
Approved by: https://github.com/albanD
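An illustrative check (not from the PR) of the in-place op against the per-tensor reference, mirroring the test described above:
```python
import torch

a = [torch.randn(4) for _ in range(3)]
b = [torch.randn(4) for _ in range(3)]
refs = [torch.maximum(x, y) for x, y in zip(a, b)]

torch._foreach_maximum_(a, b)   # updates `a` in place, like max_exp_avg_sqs
for x, r in zip(a, refs):
    torch.testing.assert_close(x, r)
```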
bfac65dfe5
[testing] Update dispatch macros (#74977)
This PR is a reland of #74289.
Co-authored-by: Khushi Agrawal <khushiagrawal411@gmail.com>
2e4152b118
Revert "[testing] Update dispatch macros"
This reverts commit eed19a0f38.
eed19a0f38
[testing] Update dispatch macros
Hi, this PR is the follow-up of #71561 (the previous PR had a couple of merge conflicts and was reverted; this PR resolves that). Please take a look. Thanks!
cc: @pmeier @mruberry @kshitij12345
Pull Request resolved: https://github.com/pytorch/pytorch/pull/74289
Approved by: https://github.com/pmeier, https://github.com/mruberry
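For context, an illustrative sketch (not from the PR) of the style these test-suite migrations move toward: ATen-style dtype helpers from `torch.testing._internal.common_dtype` instead of the older `get_all_*` helpers.
```python
import torch
from torch.testing._internal.common_dtype import (
    all_types_and_complex_and,
    floating_types_and,
)

# e.g. in place of get_all_fp_dtypes(include_half=True, include_bfloat16=True):
fp_dtypes = floating_types_and(torch.half, torch.bfloat16)
# e.g. in place of get_all_dtypes():
all_dtypes = all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool)
print(list(fp_dtypes), list(all_dtypes))
```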
105e58a552
[Foreach Reduction] Use OpMathType tensor for intermediate results
Follow-up of https://github.com/pytorch/pytorch/pull/62646
In APEX, multi_tensor_norm only supports float and half, and the dtype of `output` and `output_per_tensor` is hardcoded as single-precision.
1e08448435
[ROCm] enable foreach fastpath
Reverts #46216 now that rocm is fixed. Benchmark to verify: ```python import torch import time import torch.optim as optim from torch.autograd import Variable from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau, StepLR import torch.nn as nn import time import torchvision import torch.utils.benchmark as benchmark_utils device = "cuda" model = torchvision.models.resnet.resnet101(pretrained=True).to(device) targets = torch.randint(0, 1000, (100, 100), device=device) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.1) # <----------------------- optimizer. # would compare optim.SGD vs optim._multi_tensor.SGD optimizer_mta = optim._multi_tensor.SGD(model.parameters(), lr=1e-3, momentum=0.1) running_loss = 0.0 target = torch.empty(128, dtype=torch.long, device=device).random_(5) optimizer.zero_grad() inputs = torch.rand(128, 3, 100, 100, device=device , requires_grad=True) outputs = model(inputs) loss = criterion(outputs, target) loss.backward() optimizer.step() running_loss += loss.item() def main(): timer = benchmark_utils.Timer( stmt="torch.cuda.synchronize();optimizer.step()", globals=globals(), label="str(optimizer)", ) timer_mta = benchmark_utils.Timer( stmt="torch.cuda.synchronize(); optimizer_mta.step()", globals=globals(), label="str(optimizer_mta)", ) for _ in range(1): for i in range(1): print(f"Run: {i}\n{'-' * 40}") print(f"autorange:\n{timer.blocked_autorange()}\n\n") for i in range(1): print(f"Run: {i}\n{'-' * 40}") print(f"autorange:\n{timer_mta.blocked_autorange()}\n\n") if __name__ == "__main__": main() ``` Before revert: ``` Run: 0 ---------------------------------------- autorange: <torch.utils.benchmark.utils.common.Measurement object at 0x7f253e67c910> str(optimizer) 7.33 ms 1 measurement, 100 runs , 1 thread Run: 0 ---------------------------------------- autorange: <torch.utils.benchmark.utils.common.Measurement object at 0x7f253e67c510> str(optimizer_mta) 5.76 ms 1 measurement, 100 runs , 1 thread ``` After revert: ``` Run: 0 ---------------------------------------- autorange: <torch.utils.benchmark.utils.common.Measurement object at 0x7fa2aa15e8d0> str(optimizer) 7.35 ms 1 measurement, 100 runs , 1 thread Run: 0 ---------------------------------------- autorange: <torch.utils.benchmark.utils.common.Measurement object at 0x7fa2aa15e4d0> str(optimizer_mta) 3.53 ms 1 measurement, 100 runs , 1 thread ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/74417 Approved by: https://github.com/ngimel |
ef066f0832
Revert D34856571: [pytorch][PR] Replace get_all_ type macros with the ATen dispatch macros.
Test Plan: revert-hammer
Differential Revision: D34856571
3ded7b1da3
Replace get_all_ type macros with the ATen dispatch macros. (#71561)
Summary: Hi, Team! The PR is motivated from https://github.com/pytorch/pytorch/pull/71153#discussion_r782446738. It aims to replace `get_all` type macros with the ATen dispatch macros. The files it iterates over are: (Thanks, Lezcano, for the idea!!) <details> <summary> `test/test_autograd.py`</summary> <p> ```python 43:from torch.testing._internal.common_dtype import get_all_dtypes 8506: floating_dt = [dt for dt in get_all_dtypes() if dt.is_floating_point] ``` </p> </details> <details> <summary> `test/test_binary_ufuncs.py`</summary> <p> ```python 26: all_types_and_complex_and, integral_types_and, get_all_dtypes, get_all_int_dtypes, get_all_math_dtypes, 27: get_all_complex_dtypes, get_all_fp_dtypes, 935: dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) 1035: dtypes(*get_all_dtypes( 1488: dtypes(*(get_all_dtypes(include_bool=False, include_bfloat16=False))) 1879: dtypes(*product(get_all_dtypes(include_complex=False), get_all_dtypes(include_complex=False))) 1887: dtypes(*(get_all_int_dtypes() + [torch.bool])) 1913: dtypes(*(get_all_fp_dtypes())) 1941: dtypes(*(get_all_fp_dtypes())) 1977: dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) 2019: dtypes(*product(get_all_fp_dtypes(), get_all_fp_dtypes())) 2048: dtypes(*get_all_dtypes()) 2110: dtypes(*product(get_all_dtypes(include_complex=False), 2111: get_all_dtypes(include_complex=False))) 2128: types = [torch.bool, torch.bfloat16] + get_all_int_dtypes() 2173: if dtypes[1] in get_all_fp_dtypes(): 2178: dtypes(*product(get_all_fp_dtypes(), 2179: get_all_fp_dtypes())) 2260: dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) 2261: dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) 2273: dtypesIfCUDA(*set(get_all_math_dtypes('cuda')) - {torch.complex64, torch.complex128}) 2274: dtypes(*set(get_all_math_dtypes('cpu')) - {torch.complex64, torch.complex128}) 2307: dtypes(*get_all_math_dtypes('cpu')) 2319: dtypes(*get_all_fp_dtypes(include_bfloat16=False)) 2331: dtypes(*get_all_int_dtypes()) 2356: dtypes(*get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) 2393: if dtype in get_all_int_dtypes(): 2614: dtypes(*get_all_dtypes()) 2624: dtypes(*tuple(itertools.combinations_with_replacement(get_all_dtypes(), 2))) 2806: dtypes(*list(product(get_all_dtypes(include_complex=False), 2807: get_all_dtypes(include_complex=False)))) 2866: dtypes(*list(product(get_all_complex_dtypes(), 2867: get_all_complex_dtypes()))) 2902: dtypes(*product(get_all_dtypes(), get_all_dtypes())) 2906: dtypes(*product(get_all_dtypes(), get_all_dtypes())) 2910: dtypes(*product(get_all_dtypes(), get_all_dtypes())) 3019: dtypes = [torch.float, torch.double] + get_all_complex_dtypes() 3221: dtypes(*get_all_dtypes(include_complex=False)) 3407: dtypes(*list(product(get_all_dtypes(include_bool=False), 3408: get_all_dtypes(include_bool=False)))) 3504: dtypes(*product(get_all_dtypes(include_complex=False, include_bfloat16=False), 3505: get_all_dtypes(include_complex=False, include_bfloat16=False))) 3516: if x.dtype in get_all_int_dtypes() + [torch.bool]: 3643: dtypes(*product(get_all_dtypes(include_complex=False, 3645: get_all_dtypes(include_complex=False, ``` </p> </details> <details> <summary> `test/test_complex.py`</summary> <p> ```python 6:from torch.testing._internal.common_dtype import get_all_complex_dtypes 11: dtypes(*get_all_complex_dtypes()) ``` </p> </details> <details> <summary> `test/test_foreach.py`</summary> <p> ```python 18: get_all_dtypes, 
get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes, 142: if dtype in get_all_int_dtypes(): 179: disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] 201: disable_fastpath = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] 205: disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool] 211: disable_fastpath |= dtype not in get_all_complex_dtypes() 241: bool_int_div = op.ref == torch.div and dtype in get_all_int_dtypes() + [torch.bool] 246: disable_fastpath |= dtype in get_all_int_dtypes() + [torch.bool] 248: disable_fastpath |= dtype not in get_all_complex_dtypes() 250: disable_fastpath |= True and dtype not in get_all_complex_dtypes() 307: disable_fastpath = dtype in get_all_int_dtypes() + [torch.bool] 365: if opinfo.name == "_foreach_abs" and dtype in get_all_complex_dtypes(): 376: ops(foreach_unary_op_db, dtypes=get_all_dtypes()) 393: dtypes=get_all_dtypes(include_half=True, include_bfloat16=True, include_complex=False)) 401: ops(foreach_minmax_op_db, dtypes=get_all_fp_dtypes(include_bfloat16=True, include_half=True)) 426: if ord in (1, 2) and dtype in torch.testing.get_all_fp_dtypes(): 439: dtypes(*get_all_dtypes()) 449: ops(foreach_binary_op_db, dtypes=get_all_dtypes()) 481: ops(foreach_binary_op_db, dtypes=get_all_dtypes()) 536: if dtype in get_all_int_dtypes() + [torch.bool] and foreach_op == torch._foreach_div: 545: ops(foreach_binary_op_db, dtypes=get_all_dtypes()) 637: ops(foreach_pointwise_op_db, allowed_dtypes=get_all_fp_dtypes(include_half=False, include_bfloat16=False)) ``` </p> </details> <details> <summary> `test/test_linalg.py`</summary> <p> ```python 29: all_types, floating_types, floating_and_complex_types, get_all_dtypes, get_all_int_dtypes, get_all_complex_dtypes, 30: get_all_fp_dtypes, 111: dtypes(*(get_all_dtypes())) 794: float_and_complex_dtypes = get_all_fp_dtypes() + get_all_complex_dtypes() 807: dtypes(*(get_all_int_dtypes())) 828: dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) 841: if dtype in get_all_complex_dtypes(): 844: dtypes(*itertools.product(get_all_dtypes(), 845: get_all_dtypes())) 855: for dtypes0, dtypes1, dtypes2 in product(get_all_dtypes(), repeat=3): 5607: *get_all_fp_dtypes(include_half=not CUDA9, include_bfloat16=(CUDA11OrLater and SM53OrLater))) 5608: dtypes(*(set(get_all_dtypes()) - {torch.half, torch.bool})) 5644: dtypes(*(get_all_complex_dtypes() + get_all_fp_dtypes())) 6255: dtypesIfCUDA(*get_all_complex_dtypes(), 6256: *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)), 6292: dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) 6323: dtypesIfCUDA(*get_all_complex_dtypes(), 6324: *get_all_fp_dtypes(include_bfloat16=(TEST_WITH_ROCM or (CUDA11OrLater and SM53OrLater)))) 6325: dtypes(*get_all_complex_dtypes(), *get_all_fp_dtypes()) 6358: dtypesIfCUDA(*([torch.float, torch.double] + get_all_complex_dtypes())) 6556: dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) 6668: dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) 6741: dtypes(*get_all_fp_dtypes(), *get_all_complex_dtypes()) ``` </p> </details> <details> <summary> `test/test_nn.py`</summary> <p> ```python 37:from torch.testing._internal.common_dtype import integral_types, get_all_fp_dtypes, get_all_math_dtypes 50: onlyNativeDeviceTypes, deviceCountAtLeast, largeTensorTest, expectedFailureMeta, skipMeta, get_all_device_types, \ 8862: for device in get_all_device_types(): 9629: for dt1 in 
get_all_math_dtypes(device): 9630: for dt2 in get_all_math_dtypes(device): 9631: for dt3 in get_all_math_dtypes(device): 9648: for input_dtype in get_all_math_dtypes(device): 9664: for input_dtype in get_all_math_dtypes(device): 13015: dtypes(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) 13034: dtypes(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) 13159: dtypes(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) 17400: dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) 17768: dtypesIfCUDA(*get_all_fp_dtypes()) 17773: dtypesIfCUDA(*get_all_fp_dtypes()) 17778: dtypesIfCUDA(*get_all_fp_dtypes()) 17783: dtypesIfCUDA(*get_all_fp_dtypes()) 17788: dtypesIfCUDA(*get_all_fp_dtypes()) 17793: dtypesIfCUDA(*get_all_fp_dtypes()) 17798: dtypesIfCUDA(*get_all_fp_dtypes()) 17963: dtypesIfCUDA(*get_all_fp_dtypes()) 17977: dtypesIfCUDA(*get_all_fp_dtypes()) 18684: def test_cross_entropy_loss_prob_target_all_reductions(self, device): ``` </p> </details> <details> <summary> `test/test_numpy_interop.py`</summary> <p> ```python 12:from torch.testing._internal.common_dtype import get_all_dtypes 399: dtypes(*get_all_dtypes()) ``` </p> </details> <details> <summary> `test/test_ops.py`</summary> <p> ```python 12:from torch.testing._internal.common_dtype import floating_and_complex_types_and, get_all_dtypes 86: for dtype in get_all_dtypes(): ``` </p> </details> <details> <summary> `test/test_reductions.py`</summary> <p> ```python 16: get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_complex_dtypes, get_all_fp_dtypes, 360: allowed_dtypes=get_all_dtypes(include_bfloat16=False)) 366: allowed_dtypes=get_all_dtypes(include_bfloat16=False)) 394: allowed_dtypes=get_all_dtypes(include_bfloat16=False)) 750: for dtype in [dtype for dtype in get_all_math_dtypes('cpu') if dtype != torch.float16]: 1404: dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) 1457: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + 1458: get_all_complex_dtypes())) 1465: return dtype in get_all_int_dtypes() 1494: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) 1501: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) 1507: dtypes(*(get_all_complex_dtypes())) 1514: dtypes = list(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False)) 1523: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False))) 1531: if dtype in get_all_fp_dtypes(): 1608: dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False, 1837: dtypes(*get_all_dtypes(include_bool=False, include_complex=False)) 1855: dtypes(*(set(get_all_dtypes(include_bool=False, include_complex=False)) - {torch.uint8})) 3219: for dtype in get_all_dtypes(include_half=True, include_bfloat16=False, ``` </p> </details> <details> <summary> `test/test_serialization.py`</summary> <p> ```python 26:from torch.testing._internal.common_dtype import get_all_dtypes 586: for device, dtype in product(devices, get_all_dtypes()): 589: for other_dtype in get_all_dtypes(): ``` </p> </details> <details> <summary> `test/test_shape_ops.py`</summary> <p> ```python 18:from torch.testing._internal.common_dtype import get_all_dtypes 230: dtypes(*get_all_dtypes(include_complex=False, include_bool=False, include_half=False, 232: dtypesIfCUDA(*get_all_dtypes(include_complex=False, include_bool=False, include_bfloat16=False)) 344: dtypes(*get_all_dtypes()) 443: dtypes(*get_all_dtypes()) 461: dtypes(*get_all_dtypes()) 570: dtypes(*get_all_dtypes(include_complex=False)) 
``` </p> </details> <details> <summary> `test/test_sort_and_select.py`</summary> <p> ```python 12: all_types, all_types_and, floating_types_and, get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, 136: dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) 231: dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) 296: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) 647: dtypesIfCUDA(*get_all_fp_dtypes()) 678: dtypesIfCUDA(*(get_all_dtypes(include_complex=False, 682: dtypes(*(get_all_dtypes(include_complex=False, include_bool=False, include_half=False, include_bfloat16=False))) 739: dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128}) 740: dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) 799: dtypesIfCPU(*set(get_all_dtypes()) - {torch.complex64, torch.complex128}) 800: dtypes(*set(get_all_dtypes()) - {torch.bfloat16, torch.complex64, torch.complex128}) ``` </p> </details> <details> <summary> `test/test_sparse.py`</summary> <p> ```python 20:from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes 29: floating_and_complex_types, floating_and_complex_types_and, get_all_dtypes, get_all_int_dtypes, 1963: return dtype in get_all_int_dtypes() 1994: dtypes(*get_all_dtypes(include_bool=False, include_half=False, 2103: return dtype in get_all_int_dtypes() 2138: dtypes(*get_all_dtypes(include_bool=False, include_half=False, 2626: all_sparse_dtypes = get_all_dtypes(include_complex=True) 2633: all_sparse_dtypes = get_all_dtypes(include_complex=True) 3230: dtypes(*get_all_complex_dtypes(), 3231: *get_all_fp_dtypes(include_half=False, include_bfloat16=False)) 3234: *get_all_fp_dtypes( ``` </p> </details> <details> <summary> `test/test_sparse_csr.py`</summary> <p> ```python 7:from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes, floating_and_complex_types, make_tensor 17:from torch.testing._internal.common_dtype import floating_types, get_all_dtypes 120: dtypes(*get_all_dtypes()) 133: dtypes(*get_all_dtypes()) 150: dtypes(*get_all_dtypes()) 180: dtypes(*get_all_dtypes()) 201: dtypes(*get_all_dtypes()) 210: dtypes(*get_all_dtypes()) 225: dtypes(*get_all_dtypes()) 244: dtypes(*get_all_dtypes()) 263: dtypes(*get_all_dtypes()) 285: dtypes(*get_all_dtypes()) 411: dtypes(*get_all_dtypes()) 482: dtypes(*get_all_dtypes()) 502: dtypes(*get_all_dtypes()) 562: dtypes(*get_all_dtypes()) 588: dtypesIfCUDA(*get_all_complex_dtypes(), 589: *get_all_fp_dtypes(include_half=SM53OrLater, include_bfloat16=SM80OrLater)) 745: dtypesIfCUDA(*get_all_complex_dtypes(), 746: *get_all_fp_dtypes(include_half=SM53OrLater and TEST_CUSPARSE_GENERIC, 765: dtypesIfCUDA(*get_all_complex_dtypes(), 766: *get_all_fp_dtypes(include_half=SM53OrLater and TEST_CUSPARSE_GENERIC, 801: *torch.testing.get_all_fp_dtypes(include_bfloat16=SM80OrLater, 841: *torch.testing.get_all_fp_dtypes(include_bfloat16=SM80OrLater, 1182: dtypes(*get_all_dtypes()) 1276: dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_bfloat16=False)) 1286: dtypes(*get_all_dtypes()) ``` </p> </details> <details> <summary> `test/test_tensor_creation_ops.py`</summary> <p> ```python 21: onlyCUDA, skipCPUIf, dtypesIfCUDA, skipMeta, get_all_device_types) 23: get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes 150: for dt in get_all_dtypes(): 160: for dt in get_all_dtypes(): 314: dtypes = [dtype for dtype in get_all_dtypes() if dtype != torch.bfloat16] 1012: 
dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + 1013: get_all_complex_dtypes())) 1032: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + 1033: get_all_complex_dtypes())) 1050: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + 1051: get_all_complex_dtypes())) 1745: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) 1779: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) 1868: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) 1926: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) 1954: do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device) 1956: do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, None) 1957: do_test_empty_full(self, get_all_math_dtypes('cpu'), torch.strided, torch_device) 2538: for device in get_all_device_types(): 2645: for dtype in get_all_dtypes(): 2678: dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False) + 2679: get_all_complex_dtypes())) 2716: dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False)) 2827: for dt in get_all_dtypes(): 2913: dtypes(*get_all_dtypes(include_bool=False, include_half=False)) 2914: dtypesIfCUDA(*get_all_dtypes(include_bool=False, include_half=True)) 3028: dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) 3033: dtypes(*(get_all_fp_dtypes() + get_all_complex_dtypes())) 3074: dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) 3075: dtypesIfCUDA(*((get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16]) 3077: else get_all_dtypes(include_bool=False, include_half=True, include_complex=False))) 3873: dtypes(*get_all_dtypes()) 3884: dtypes(*get_all_dtypes(include_bool=False)) 3916: for other in get_all_dtypes(): 3922: dtypes(*get_all_dtypes()) 3932: dtypes(*get_all_dtypes(include_bool=False)) 3955: dtypes(*get_all_dtypes(include_bool=False)) 3961: dtypes(*get_all_dtypes(include_bool=False)) 3965: dtypes(*get_all_dtypes()) ``` </p> </details> <details> <summary> `test/test_testing.py`</summary> <p> ```python 25:from torch.testing._internal.common_dtype import get_all_dtypes 31: dtypes(*(get_all_dtypes(include_half=True, include_bfloat16=False, ``` </p> </details> <details> <summary> `test/test_torch.py`</summary> <p> ```python 51: expectedAlertNondeterministic, get_all_device_types, skipXLA) 57: get_all_fp_dtypes, get_all_int_dtypes, get_all_math_dtypes, get_all_dtypes, get_all_complex_dtypes 296: for d in get_all_device_types(): 323: for device in get_all_device_types(): 324: for dt1 in get_all_dtypes(): 325: for dt2 in get_all_dtypes(): 343: all_dtypes = get_all_dtypes() 350: all_dtypes = get_all_dtypes() 781: for dtype in get_all_dtypes(): 986: for device in get_all_device_types(): 1017: for device in get_all_device_types(): 1018: for dtype in get_all_math_dtypes(device): 2792: for device in get_all_device_types(): 3186: dtypes(*get_all_dtypes()) 3195: for error_dtype in get_all_dtypes(): 3203: dtypes(*get_all_dtypes()) 3212: for error_dtype in get_all_dtypes(): 4539: dtypes(*get_all_fp_dtypes()) 4545: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) 4577: dtypes(*get_all_fp_dtypes(include_half=False, include_bfloat16=False)) 4578: dtypesIfCPU(*(get_all_fp_dtypes(include_half=False, include_bfloat16=True))) 4579: dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False))) 4599: dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False))) 4600: dtypesIfCPU(*(get_all_dtypes(include_half=False, 
include_bfloat16=False, include_complex=False))) 4601: dtypesIfCUDA(*(get_all_dtypes(include_bfloat16=False, include_complex=False))) 4613: for p_dtype in get_all_fp_dtypes(include_half=device.startswith('cuda'), include_bfloat16=False): 4628: dtypes(*(get_all_fp_dtypes(include_half=False, include_bfloat16=False))) 4629: dtypesIfCUDA(*(get_all_fp_dtypes(include_bfloat16=False))) 4640: dtypes(*get_all_fp_dtypes()) 4723: dtypes(*get_all_fp_dtypes()) 4735: dtypes(*get_all_fp_dtypes(include_bfloat16=False)) 4736: dtypesIfCUDA(*get_all_fp_dtypes()) 4747: dtypes(*get_all_fp_dtypes()) 4761: dtypes(*get_all_fp_dtypes()) 4771: dtypes(*get_all_fp_dtypes()) 4792: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) 5302: dtypes(*get_all_dtypes(include_bfloat16=False)) 5322: dtypes(*get_all_dtypes(include_half=False, include_bfloat16=False)) 5323: dtypesIfCPU(*get_all_dtypes(include_bfloat16=False)) 5324: dtypesIfCUDA(*get_all_dtypes(include_bfloat16=False)) 5591: for dt in get_all_dtypes(): 5611: for dt in get_all_dtypes(): 5678: for dt in get_all_dtypes(): 5696: dtypesIfCUDA(*set(get_all_math_dtypes('cuda'))) 5697: dtypes(*set(get_all_math_dtypes('cpu'))) 5746: dtypes(*get_all_dtypes()) 5780: dtypes(*get_all_dtypes()) 5885: dtypes(*get_all_dtypes()) 5902: dtypes(*get_all_dtypes()) 5945: dtypes(*get_all_dtypes()) 5979: dtypes(*get_all_dtypes(include_bool=False)) 6049: dtypes(*get_all_dtypes(include_bool=False)) 6092: dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + 6093: get_all_complex_dtypes())) 6094: dtypesIfCPU(*get_all_dtypes()) 6095: dtypesIfCUDA(*get_all_dtypes()) 6122: dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + 6123: get_all_complex_dtypes())) 6124: dtypesIfCPU(*get_all_dtypes()) 6125: dtypesIfCUDA(*get_all_dtypes()) 6163: dtypes(*(get_all_fp_dtypes(include_bfloat16=False, include_half=False) + 6164: get_all_complex_dtypes())) 6165: dtypesIfCPU(*get_all_dtypes()) 6166: dtypesIfCUDA(*get_all_dtypes()) 6190: dtypes(*(get_all_complex_dtypes() + 6191: get_all_int_dtypes())) 6238: dtypes(*get_all_dtypes()) 6323: dtypes(*get_all_dtypes()) 6389: dtypes(*product(get_all_dtypes(), (torch.uint8, torch.bool))) 6699: dtypesIfCUDA(*set(get_all_math_dtypes('cuda'))) 6700: dtypes(*set(get_all_math_dtypes('cpu'))) 7452: dtypes(*get_all_dtypes(include_bool=False)) 7461: dtypes(*get_all_dtypes(include_bool=False)) 7477: dtypes(*get_all_dtypes(include_bool=False)) 7496: dtypes(*get_all_dtypes(include_bool=False)) 7538: dtypes(*get_all_dtypes(include_bool=False)) 8162: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() + 8163: get_all_complex_dtypes())) 8175: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes() + 8176: get_all_complex_dtypes())) ``` </p> </details> <details> <summary> `test/test_type_promotion.py`</summary> <p> ```python 14: get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes 187: for dtype in get_all_dtypes(): 262: dtypes1 = get_all_math_dtypes('cuda') 263: dtypes2 = get_all_math_dtypes(device) 339: dtypes(*itertools.product(get_all_dtypes(), get_all_dtypes())) 468: for dt1 in get_all_math_dtypes(device): 469: for dt2 in get_all_math_dtypes(device): 519: for dt1 in get_all_math_dtypes(device): 520: for dt2 in get_all_math_dtypes(device): 528: for dt in get_all_math_dtypes(device): 561: for dtype in get_all_dtypes(): 766: dtypes=get_all_math_dtypes(device)) 771: dtypes=get_all_math_dtypes(device)) 782: dtypes=get_all_math_dtypes(device)) 879: dtypes = get_all_dtypes(include_bfloat16=False) 898: dtypes = 
get_all_dtypes(include_bfloat16=False, include_bool=False) 965: dtypesIfCUDA(*itertools.product(get_all_dtypes(include_bfloat16=False, include_complex=False), 966: get_all_dtypes(include_bfloat16=False, include_complex=False))) 967: dtypes(*itertools.product(get_all_dtypes(include_half=False, include_bfloat16=False, 969: get_all_dtypes(include_half=False, include_bfloat16=False, 976: return dtype in get_all_int_dtypes() + [torch.bool] 979: return dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False) ``` </p> </details> <details> <summary> `test/test_unary_ufuncs.py`</summary> <p> ```python 24: floating_types_and, all_types_and_complex_and, floating_and_complex_types_and, get_all_dtypes, get_all_math_dtypes, 25: get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes 517: dtypes(*(get_all_int_dtypes() + [torch.bool] + 518: get_all_fp_dtypes(include_bfloat16=False))) 596: dtypes(*get_all_fp_dtypes(include_half=True, include_bfloat16=False)) 611: invalid_input_dtypes = get_all_int_dtypes() + \ 612: get_all_complex_dtypes() + \ 619: for dtype in get_all_fp_dtypes(include_half=True, include_bfloat16=False): 1048: dtypes(*get_all_math_dtypes('cpu')) 1182: dtypesIfCUDA(*get_all_fp_dtypes()) 1190: dtypesIfCUDA(*get_all_fp_dtypes()) 1205: dtypesIfCUDA(*get_all_fp_dtypes()) 1215: dtypesIfCUDA(*get_all_fp_dtypes()) 1307: dtypes(*(get_all_dtypes(include_bool=False))) 1349: dtypes(*(get_all_fp_dtypes(include_half=False) + 1350: get_all_complex_dtypes())) 1351: dtypesIfCUDA(*(get_all_fp_dtypes(include_half=True) + 1352: get_all_complex_dtypes())) ``` </p> </details> <details> <summary> `test/test_view_ops.py`</summary> <p> ```python 19: get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes 124: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) 131: dtypes(*get_all_dtypes(include_bfloat16=False)) 213: for view_dtype in [*get_all_fp_dtypes(), *get_all_complex_dtypes()]: 220: dtypes(*get_all_dtypes()) 224: for view_dtype in get_all_dtypes(): 305: dtypes(*get_all_complex_dtypes(include_complex32=True)) 343: dtypes(*get_all_dtypes()) 354: dtypes(*get_all_dtypes()) 364: dtypes(*get_all_dtypes()) 374: dtypes(*get_all_dtypes()) 384: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) 395: dtypes(*get_all_complex_dtypes()) 426: dtypes(*get_all_complex_dtypes()) 451: dtypes(*product(get_all_complex_dtypes(), get_all_dtypes())) 1263: dtypes(*(torch.testing.get_all_dtypes())) 1279: dtypes(*(torch.testing.get_all_dtypes())) 1405: dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes(include_bfloat16=False) + 1406: get_all_complex_dtypes())) 1471: dtypes(*get_all_dtypes(include_bfloat16=False)) 1574: dtypes(*get_all_dtypes()) 1601: dtypes(*get_all_dtypes(include_bfloat16=False)) 1632: dtypes(*get_all_dtypes(include_bfloat16=False)) 1711: for dt in get_all_dtypes(): 1717: for dt in get_all_dtypes(): 1724: for dt in get_all_dtypes(): ``` </p> </details> I'm looking forward to your viewpoints. Thanks :) cc: mruberry kshitij12345 anjali411 Pull Request resolved: https://github.com/pytorch/pytorch/pull/71561 Reviewed By: samdow Differential Revision: D34856571 Pulled By: mruberry fbshipit-source-id: 0dca038bcad5cf69906245c496d2e61ac3876335 (cherry picked from commit b058f67b4313143efa714ab105f36e74083131b9) |
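For context, the sketch below is not part of the commit; it is a minimal illustration of the flavor of substitution the PR performs in these test files, where a `get_all_*` getter inside a `@dtypes(...)` decorator is replaced by the corresponding helper from `torch.testing._internal.common_dtype`. The test class, test names, and exact helper arguments are assumptions made for the example.

```python
import torch
from torch.testing._internal.common_device_type import dtypes, instantiate_device_type_tests
from torch.testing._internal.common_dtype import all_types_and_complex_and, floating_types_and
from torch.testing._internal.common_utils import TestCase, run_tests


class TestDtypeMigration(TestCase):
    # Before: @dtypes(*get_all_dtypes())
    @dtypes(*all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool))
    def test_clone_roundtrip(self, device, dtype):
        x = torch.zeros(3, device=device, dtype=dtype)
        self.assertEqual(x.clone(), x)

    # Before: @dtypes(*get_all_fp_dtypes(include_bfloat16=False))
    @dtypes(*floating_types_and(torch.half))
    def test_floating_only(self, device, dtype):
        self.assertTrue(dtype.is_floating_point)


instantiate_device_type_tests(TestDtypeMigration, globals())

if __name__ == "__main__":
    run_tests()
```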
d4d0ab71b3 |
use torch.testing.assert_equal in TestCase.assertEqual (#67796)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/67796
Supersedes #58981.
cc mruberry
Test Plan: Imported from OSS
Reviewed By: ngimel
Differential Revision: D33542994
Pulled By: mruberry
fbshipit-source-id: 527099f5fdc154fd95ee48cd19f0a85eeec43443
(cherry picked from commit
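For context, a minimal sketch, not taken from the PR, of what the delegated comparison looks like from the caller's side. The public `torch.testing.assert_close` is used here as a stand-in for the internal `assert_equal` helper named in the title, with zero tolerances to force an exact comparison.

```python
import torch
from torch.testing import assert_close

a = torch.tensor([1.0, 2.0, 3.0])
b = a.clone()

# Exact comparison (what an "assert_equal" amounts to).
assert_close(a, b, rtol=0, atol=0)

# With the default tolerances, tiny floating-point differences are still accepted.
assert_close(a, a + 1e-7)
```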
61ea2fc35e |
Fix device type / dtype handling for parametrized test names (#65217)
Summary: This PR absolves `_TestParametrizer`s (e.g. `ops`, `modules`, `parametrize`) of the responsibility of adding device type (e.g. `'cpu'`, `'cuda'`, etc.) / dtype (e.g. 'float32') to generated test names. This fixes repeated instances of the device string being added to generated test names (e.g. `test_batch_norm_training_True_cuda_track_running_stats_True_cuda_affine_True_cuda`). The responsibility for placing device / dtype suffixes is now handled by `instantiate_device_type_tests()` instead so it is added a single time. It will place `<device>_<dtype>` at the end of the test name unconditionally, maintaining the current naming convention. As part of this work, I also tightened the semantics through some additional error case handling: * Composing multiple decorators that each try to handle the same parameter will error out with a nice message. This includes the case to trying to compose `modules` + `ops`, as they each try to handle `dtype`. Similarly, `ops` + `dtypes` is forbidden when both try to handle `dtype`. This required changes in the following test files: * `test/test_unary_ufuncs.py` * `test/test_foreach.py` * The `modules` / `ops` decorators will now error out with a nice message if used with `instantiate_parametrized_tests()` instead of `instantiate_device_type_tests()`, since they're not (currently) written to work outside of a device-specific context. Pull Request resolved: https://github.com/pytorch/pytorch/pull/65217 Reviewed By: mruberry Differential Revision: D32627303 Pulled By: jbschlosser fbshipit-source-id: c2957228353ed46a0b7da8fa1a34c67598779312 |
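For context, a small sketch (not from the PR; the test content is made up) of the composition the fix targets: `@parametrize` contributes the parameter suffix, while `instantiate_device_type_tests()` now appends the `<device>_<dtype>` suffix exactly once at the end of the generated name (roughly `test_flip_dim_0_cpu_float32`).

```python
import torch
from torch.testing._internal.common_device_type import dtypes, instantiate_device_type_tests
from torch.testing._internal.common_utils import TestCase, parametrize, run_tests


class TestNaming(TestCase):
    @parametrize("dim", [0, 1])
    @dtypes(torch.float32)
    def test_flip(self, device, dtype, dim):
        x = torch.arange(6, device=device, dtype=dtype).reshape(2, 3)
        self.assertEqual(torch.flip(x, dims=(dim,)), x.flip(dim))


instantiate_device_type_tests(TestNaming, globals())

if __name__ == "__main__":
    run_tests()
```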
cdd5d16489 |
[Foreach] Implement L1&L2 norm (#62646)
Summary: Implement the L1 & L2 norms on the fast path, using [nvidia/apex](https://github.com/NVIDIA/apex/blob/master/csrc/multi_tensor_l2norm_kernel.cu) as a reference. When `ord` is neither 1 nor 2, the slow path is chosen. Related: https://github.com/pytorch/pytorch/issues/58833 cc ptrblck mcarilli ngimel Pull Request resolved: https://github.com/pytorch/pytorch/pull/62646 Reviewed By: malfet Differential Revision: D32173421 Pulled By: ngimel fbshipit-source-id: 14b7544601658a979b83509df351e1848ded7675 |
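For context, an illustrative usage sketch (not from the PR): `torch._foreach_norm` returns one norm per tensor in the list, and orders other than 1 or 2 fall back to the per-tensor slow path.

```python
import torch

tensors = [torch.randn(10), torch.randn(3, 4), torch.randn(5)]

# One call over the whole list; on CUDA, ord=1 and ord=2 can use the fused fast path.
norms = torch._foreach_norm(tensors, 2)
reference = [t.norm(2) for t in tensors]

for got, expected in zip(norms, reference):
    torch.testing.assert_close(got, expected)
```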
c19cda5782 |
[skip ci] Add test owners for a special hi-pri class of tests (#67553)
Summary: Action following https://github.com/pytorch/pytorch/issues/66232 This change does require some context: there were several suggestions regarding what to do about this group of tests: tests that are core and crucial to all of PyTorch and are too broad to be owned by one team. 1. Let's add a "module: core" and put people behind it! This idea sounds appealing unless you are one of the people backing the label. From talking to albanD among others, this idea of putting all these core tests on the shoulder of a few people or one team isn't super fair and I have not yet found anyone willing to take on this job. 2. Taking advantage of the fact that we already have a triaging oncall that takes turns triaging issues, we can leave these tests essentially unlabeled and allow the oncall to triage these tests. Since these tests are crucial to PyTorch, we'll add the "high priority" label to mark them different from other unowned tests (see https://github.com/pytorch/pytorch/issues/67552). 3. I _could_ still create an unbacked label "module: core" and attribute these tests there, but I don't like the idea of creating a facade that the tests are "triaged" to a label when no one is actually taking a look. Now we could potentially break these tests down into smaller files so that each piece _could_ be owned by a team, but 1. I don't know if this is currently feasible and 2. This approach does not prevent that from happening in the future. Pull Request resolved: https://github.com/pytorch/pytorch/pull/67553 Reviewed By: albanD Differential Revision: D32025004 Pulled By: janeyx99 fbshipit-source-id: 1fb1aa4c27e305695ab6e80ae3d02f90519939c0 |
26b7ff5aea |
deprecate dtype getters from torch.testing namespace (#63554)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/63554 Following https://github.com/pytorch/pytorch/pull/61840#issuecomment-884087809, this deprecates all the dtype getters publicly exposed in the `torch.testing` namespace. The reason for this twofold: 1. If someone is not familiar with the C++ dispatch macros PyTorch uses, the names are misleading. For example `torch.testing.floating_types()` will only give you `float32` and `float64` skipping `float16` and `bfloat16`. 2. The dtype getters provide very minimal functionality that can be easily emulated by downstream libraries. We thought about [providing an replacement](https://gist.github.com/pmeier/3dfd2e105842ad0de4505068a1a0270a), but ultimately decided against it. The major problem is BC: by keeping it, either the namespace is getting messy again after a new dtype is added or we need to somehow version the return values of the getters. Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D30662206 Pulled By: mruberry fbshipit-source-id: a2bdb10ab02ae665df1b5b76e8afa9af043bbf56 |
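For context, this is the kind of tiny downstream helper the message suggests libraries can maintain themselves instead of relying on the deprecated getters; the function name and defaults below are invented for illustration.

```python
import torch

def my_float_dtypes(include_half=True, include_bfloat16=True):
    # Being explicit avoids a getter whose name hides which dtypes it actually returns.
    dtypes = [torch.float32, torch.float64]
    if include_half:
        dtypes.append(torch.float16)
    if include_bfloat16:
        dtypes.append(torch.bfloat16)
    return dtypes

print(my_float_dtypes(include_bfloat16=False))  # [torch.float32, torch.float64, torch.float16]
```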
d37636901e |
[Doc] make_tensor to torch.testing module (#63925)
Summary: This PR aims to add `make_tensor` to the `torch.testing` module in PyTorch docs. TODOs: * [x] Add examples cc: pmeier mruberry brianjo Pull Request resolved: https://github.com/pytorch/pytorch/pull/63925 Reviewed By: ngimel Differential Revision: D30633487 Pulled By: mruberry fbshipit-source-id: 8e5a1f880c6ece5925b4039fee8122bd739538af |
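For context, an illustrative `make_tensor` call (not one of the examples added by the PR): the dtype and device are required, and value bounds are optional.

```python
import torch
from torch.testing import make_tensor

x = make_tensor((3, 4), dtype=torch.float32, device="cpu", low=-1.0, high=1.0)
mask = make_tensor((3, 4), dtype=torch.bool, device="cpu")

print(x.shape, x.dtype, mask.dtype)  # torch.Size([3, 4]) torch.float32 torch.bool
```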
1022443168 |
Revert D30279364: [codemod][lint][fbcode/c*] Enable BLACK by default
Test Plan: revert-hammer
Differential Revision:
D30279364 (
b004307252 |
[codemod][lint][fbcode/c*] Enable BLACK by default
Test Plan: manual inspection & sandcastle Reviewed By: zertosh Differential Revision: D30279364 fbshipit-source-id: c1ed77dfe43a3bde358f92737cd5535ae5d13c9a |
693b0af996 |
Port addcmul kernels to structured kernels. (#62318)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62318 Tracking issue: #55070 This PR introduces the method `TensorIteratorBase::build_ternary_op` for building a `TensorIteratorBase` for 3-input 1-output kernel. Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D29961997 Pulled By: bdhirsh fbshipit-source-id: 2208d24823bad6e74c8d508f363716d8125b8619 |
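For context, a short illustration (not from the PR) of the three-input, one-output contract that the new `build_ternary_op` path services, seen from Python: `addcmul` computes `input + value * tensor1 * tensor2`.

```python
import torch

base = torch.zeros(3)
t1 = torch.tensor([1.0, 2.0, 3.0])
t2 = torch.tensor([4.0, 5.0, 6.0])

out = torch.addcmul(base, t1, t2, value=0.5)
torch.testing.assert_close(out, base + 0.5 * t1 * t2)
```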
773a8eede4 |
[profiler][refactor] Refactor the usage of legacy profiler implementation (#61931)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61931 This PR consolidates the profiling code around a new C++ implementation (profiler_kineto.h/cpp) and uses it unconditionally from torch.autograd.profiler/torch.profiler: 1. Always use profiler_kineto.h/cpp as the C++ implementation 2. Simplify profiler.py to remove unneeded parts depending on legacy impl 3. Move some of the legacy logic into profiler_legacy.py (to be fully deleted later) Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install --cmake python test/test_profiler.py -v USE_KINETO=0 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install --cmake python test/test_profiler.py -v Imported from OSS Reviewed By: gdankel Differential Revision: D29801599 fbshipit-source-id: 9794d29f2af38dddbcd90dbce4481fc8575fa29e |
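For context, a minimal profiling snippet (not from the PR) that exercises the consolidated Kineto-backed implementation through the public `torch.profiler` API.

```python
import torch
from torch.profiler import profile, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU]) as prof:
    x = torch.randn(256, 256)
    y = x @ x

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))
```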
43d4fe68cd |
[Foreach] support implicit broadcasting in slow path (#62167)
Summary: This PR makes foreach functions support implicit broadcasting via the slow path. rel: https://github.com/pytorch/pytorch/issues/58833 cc: ptrblck ngimel Pull Request resolved: https://github.com/pytorch/pytorch/pull/62167 Reviewed By: mruberry Differential Revision: D30005109 Pulled By: ngimel fbshipit-source-id: f48c0a13e304411763541ffcfcfc6154adb26bac |
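For context, an illustrative example (not from the PR) of the broadcasting behavior the slow path now supports; the shapes below are arbitrary but pairwise broadcastable.

```python
import torch

xs = [torch.randn(2, 3), torch.randn(4, 1)]
ys = [torch.randn(3), torch.randn(4, 5)]  # each ys[i] broadcasts against xs[i]

out = torch._foreach_add(xs, ys)          # mismatched shapes route to the slow path
ref = [x + y for x, y in zip(xs, ys)]

for got, expected in zip(out, ref):
    torch.testing.assert_close(got, expected)
```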
8a2063e58a |
Foreach Test Refactor: Pointwise, Min/Max-imum (#61327)
Summary: - rewrite pointwise unittests using `ops` decorator - rewrite minimum&maximum unittests using `ops` decorator - enable minimum/maximum fastpath for BFloat16 - remove _test_data method https://github.com/pytorch/pytorch/issues/58833 cc: ptrblck ngimel Pull Request resolved: https://github.com/pytorch/pytorch/pull/61327 Reviewed By: albanD Differential Revision: D29830209 Pulled By: ngimel fbshipit-source-id: fa7805262b86c40fc32750b16629d80ad48ea4b5 |
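For context, an illustrative call (not from the PR) to one of the ops the refactored tests cover; with this change, bfloat16 inputs can also take the fused fast path on CUDA when they qualify.

```python
import torch

xs = [torch.randn(4, dtype=torch.bfloat16), torch.randn(2, dtype=torch.bfloat16)]
ys = [torch.randn(4, dtype=torch.bfloat16), torch.randn(2, dtype=torch.bfloat16)]

out = torch._foreach_maximum(xs, ys)  # elementwise maximum, one list-level call
```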
fac744e116 |
Foreach Binary Test Refactor (#59907)
Summary: Related: https://github.com/pytorch/pytorch/issues/58833 ## Changes I'm a bit concerned - binary ops with one tensorlist and one scalarlist support complex dtypes. To realize this, I added a specialization of [`TensorListScalarListMetadata<c10::complex<double>, 1>` ](https://github.com/pytorch/pytorch/pull/59907/files#diff-131eb9b310905b15b3528da6a23e542a3a3aa952bc88f7423c98a23a8a28cca1R49). This might be out of the scope of this pull request. cc ptrblck ngimel mcarilli Pull Request resolved: https://github.com/pytorch/pytorch/pull/59907 Reviewed By: mruberry Differential Revision: D29551001 Pulled By: ngimel fbshipit-source-id: 46b25fdba85dd4d6332a77b27376fe96cd422384 |
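For context, the case the author calls out, sketched illustratively (not from the PR): a binary foreach op taking one tensorlist and one scalarlist, where the scalars are complex.

```python
import torch

xs = [torch.randn(2, dtype=torch.complex64), torch.randn(3, dtype=torch.complex64)]
scalars = [1 + 2j, 0.5j]                  # one Python scalar per tensor

out = torch._foreach_mul(xs, scalars)
ref = [x * s for x, s in zip(xs, scalars)]

for got, expected in zip(out, ref):
    torch.testing.assert_close(got, expected)
```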
52b2ed65c0 |
Revert D29007258: Revert D28926135: [pytorch][PR] Refactor Foreach Tests: Unary Functions
Test Plan: revert-hammer Differential Revision: D29007258 Original commit changeset: c15f51661641 fbshipit-source-id: 98236153136a5c6b6c2911079b7bd214da6cb424 |
171142f9cc |
Revert D28926135: [pytorch][PR] Refactor Foreach Tests: Unary Functions
Test Plan: revert-hammer
Differential Revision:
D28926135 (
0897df18a3 |
Refactor Foreach Tests: Unary Functions (#58960)
Summary: Related issue: https://github.com/pytorch/pytorch/issues/58833 __changes__ - slowpath tests: pass tensors of every dtype and device and compare the behavior with the regular functions, including in-place variants - check the number of `cudaLaunchKernel` calls - rename `ForeachUnaryFuncInfo` -> `ForeachFuncInfo`: this change is mainly for the future binary/pointwise test refactors cc: ngimel ptrblck mcarilli Pull Request resolved: https://github.com/pytorch/pytorch/pull/58960 Reviewed By: ejguan Differential Revision: D28926135 Pulled By: ngimel fbshipit-source-id: 4eb21dcebbffffaf79259e31961626e0707fb8d1 |
7eade660c6 |
[PyTorch] Reduce errors of foreach functions (#56993)
Summary: This is based on https://github.com/pytorch/pytorch/issues/48224. To make `foreach` more flexible, this PR pushes unsupported cases to the slow path. It also adds some tests to verify that - `foreach` functions work with tensors of different dtypes and/or memory layouts in