pytorch/benchmarks/dynamo/huggingface.yaml
Valentine233 67883e70c0 change GPT2ForSequenceClassification inference accuracy tolerance (#136749)
Fixes https://github.com/pytorch/pytorch/issues/123503.

https://github.com/pytorch/pytorch/pull/121866 makes GPT2ForSequenceClassification hit SDPA pattern 18, which then exposes the accuracy issue. The issue only occurs with BF16 inference in single-thread mode. This PR increases the model's tolerance from 4e-3 to 5e-3 so the check passes. Note that the issue comes from small implementation differences between backends: for example, the SDPA math backend scales q and k before the matmul for numerical stability, while the flash attention backend, being a different algorithm, diverges even more.
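To see where a diff of this size can come from, here is a minimal sketch (not the actual SDPA kernels; shapes and names are illustrative only) comparing two mathematically equivalent ways of applying the softmax scale in BF16:

```python
import math
import torch

# Illustrative shapes only; the real benchmark uses GPT2's attention dimensions.
q = torch.randn(1, 12, 128, 64, dtype=torch.bfloat16)
k = torch.randn(1, 12, 128, 64, dtype=torch.bfloat16)
scale = 1.0 / math.sqrt(q.size(-1))

# Math-backend style: fold sqrt(scale) into q and k before the matmul.
scores_pre = (q * math.sqrt(scale)) @ (k * math.sqrt(scale)).transpose(-2, -1)

# Naive style: apply the full scale after the matmul.
scores_post = (q @ k.transpose(-2, -1)) * scale

# Equal in exact arithmetic, but slightly different in BF16 -- the kind of
# gap the relaxed 5e-3 tolerance is meant to absorb.
print((scores_pre - scores_post).abs().max())
```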

Pull Request resolved: https://github.com/pytorch/pytorch/pull/136749
Approved by: https://github.com/jgong5, https://github.com/jansel
2024-10-12 01:12:28 +00:00


skip:
  all:
    # Difficult to set up accuracy test because .eval() is not supported
    - Reformer
    # Fails deepcopy
    - BlenderbotForConditionalGeneration
    - GPTNeoForCausalLM
    - GPTNeoForSequenceClassification
    # Fails even with batch size = 1
    - GPTJForCausalLM
    - GPTJForQuestionAnswering

  device:
    cpu: []

  control_flow:
    - AllenaiLongformerBase
batch_size:
  # TODO - Fails even after fake tensors
  divisors:
    AlbertForMaskedLM: 2
    AlbertForQuestionAnswering: 2
    AllenaiLongformerBase: 2
    BartForCausalLM: 2
    BartForConditionalGeneration: 2
    BertForMaskedLM: 2
    BertForQuestionAnswering: 2
    BlenderbotForCausalLM: 8
    # BlenderbotForConditionalGeneration : 16
    BlenderbotSmallForCausalLM: 4
    BlenderbotSmallForConditionalGeneration: 2
    CamemBert: 2
    DebertaForMaskedLM: 4
    DebertaForQuestionAnswering: 2
    DebertaV2ForMaskedLM: 4
    DebertaV2ForQuestionAnswering: 8
    DistilBertForMaskedLM: 2
    DistilBertForQuestionAnswering: 2
    DistillGPT2: 2
    ElectraForCausalLM: 2
    ElectraForQuestionAnswering: 2
    GPT2ForSequenceClassification: 2
    # GPTJForCausalLM : 2
    # GPTJForQuestionAnswering : 2
    # GPTNeoForCausalLM : 32
    # GPTNeoForSequenceClassification : 2
    GoogleFnet: 2
    LayoutLMForMaskedLM: 2
    LayoutLMForSequenceClassification: 2
    M2M100ForConditionalGeneration: 4
    MBartForCausalLM: 2
    MBartForConditionalGeneration: 2
    MT5ForConditionalGeneration: 2
    MegatronBertForCausalLM: 4
    MegatronBertForQuestionAnswering: 2
    MobileBertForMaskedLM: 2
    MobileBertForQuestionAnswering: 2
    OPTForCausalLM: 2
    PLBartForCausalLM: 2
    PLBartForConditionalGeneration: 2
    PegasusForCausalLM: 4
    PegasusForConditionalGeneration: 2
    RobertaForCausalLM: 2
    RobertaForQuestionAnswering: 2
    Speech2Text2ForCausalLM: 4
    T5ForConditionalGeneration: 2
    T5Small: 2
    TrOCRForCausalLM: 2
    XGLMForCausalLM: 4
    XLNetLMHeadModel: 2
    YituTechConvBert: 2
tolerance:
  higher_training:
    - MT5ForConditionalGeneration
    # AlbertForQuestionAnswering fails in CI GCP A100 but error does not seem
    # harmful.
    - AlbertForQuestionAnswering

  higher_max_autotune_training:
    # DebertaForQuestionAnswering needs higher tolerance in Max-Autotune mode
    - DebertaForQuestionAnswering

  higher_inference:
    - GPT2ForSequenceClassification
    - RobertaForQuestionAnswering

  higher_inference_cpu:
    - LayoutLMForSequenceClassification
    - GPT2ForSequenceClassification

  cosine: []
accuracy:
  skip:
    large_models:
      # Models too large to run eager, dynamo and fp64_numbers simultaneously,
      # even on a 40 GB machine.
      - DebertaV2ForMaskedLM
      - BlenderbotForCausalLM

  only_inference:
    # Fails with dynamo in train mode
    - M2M100ForConditionalGeneration

  only_fp32:
    - GoogleFnet
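
As a usage illustration only (this is not the actual benchmark harness code; the helper name and config path are assumptions), the lists above can be queried with PyYAML like so, e.g. to check whether a model gets the relaxed inference tolerance:

```python
import yaml  # PyYAML, assumed available in the benchmark environment

# Hypothetical helper: report whether a model is listed under
# tolerance.higher_inference in the YAML config shown above.
def needs_higher_inference_tolerance(model_name, config_path="huggingface.yaml"):
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    return model_name in cfg["tolerance"]["higher_inference"]

# With the config above this prints True, matching the 4e-3 -> 5e-3 bump
# described in the commit message.
print(needs_higher_inference_tolerance("GPT2ForSequenceClassification"))
```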