Skip to main content

Inference

pydantic model eole.config.inference.DecodingConfig

Bases: Config

Show JSON schema
{
"title": "DecodingConfig",
"type": "object",
"properties": {
"beam_size": {
"default": 5,
"description": "Beam size.",
"title": "Beam Size",
"type": "integer"
},
"ratio": {
"default": -0.0,
"description": "Ratio based beam stop condition.",
"title": "Ratio",
"type": "number"
},
"top_k": {
"default": 0,
"description": "Set this to -1 to do random sampling from full distribution. Set this to value k>1 to do random sampling restricted to the k most likely next tokens. Set this to 1 to use argmax.",
"title": "Top K",
"type": "integer"
},
"top_p": {
"default": 0.0,
"description": "Probability for top-p/nucleus sampling. Restrict tokens to the most likely until the cumulated probability is over p. In range [0,1]. (https://arxiv.org/abs/1904.09751)",
"maximum": 1.0,
"minimum": 0.0,
"title": "Top P",
"type": "number"
},
"temperature": {
"default": 1.0,
"description": "If doing random sampling, divide the logits by this before computing softmax during decoding.",
"title": "Temperature",
"type": "number"
},
"length_penalty": {
"default": "avg",
"description": "Length penalty to use.",
"enum": [
"avg",
"wu",
"none"
],
"title": "Length Penalty",
"type": "string"
},
"alpha": {
"default": 1.0,
"description": "Length penalty parameter (higher = longer generation)",
"title": "Alpha",
"type": "number"
},
"coverage_penalty": {
"default": "none",
"description": "Coverage penalty to use. Only available in beam search.",
"enum": [
"none",
"wu",
"summary"
],
"title": "Coverage Penalty",
"type": "string"
},
"beta": {
"default": -0.0,
"description": "Coverage penalty parameter.",
"title": "Beta",
"type": "number"
},
"stepwise_penalty": {
"default": false,
"description": "Apply coverage penalty at every decoding step. Helpful for summary penalty.",
"title": "Stepwise Penalty",
"type": "boolean"
},
"min_length": {
"default": 0,
"description": "Minimum prediction length.",
"minimum": 0,
"title": "Min Length",
"type": "integer"
},
"max_length": {
"default": 250,
"description": "Maximum prediction length.",
"title": "Max Length",
"type": "integer"
},
"max_length_ratio": {
"default": 2,
"description": "Maximum prediction length ratio. For European languages, 2 is large enough, for target Asian characters, need to increase to 2-3, for special languages (Burmese, Amharic) to 10. Set to 0 to disable ratio-based length capping.",
"minimum": 0,
"title": "Max Length Ratio",
"type": "number"
},
"block_ngram_repeat": {
"default": 0,
"description": "Block repetition of ngrams during decoding.",
"title": "Block Ngram Repeat",
"type": "integer"
},
"ignore_when_blocking": {
"default": [],
"description": "Ignore these strings when blocking repeats. You want to block sentence delimiters.",
"items": {
"type": "string"
},
"title": "Ignore When Blocking",
"type": "array"
},
"replace_unk": {
"default": false,
"description": "Replace the generated UNK tokens with the source token that had the highest attention weight. If phrase_table is provided, it will look up the identified source token and give the corresponding target token. If it is not provided (or the identified source token does not exist in the table), then it will copy the source token.",
"title": "Replace Unk",
"type": "boolean"
},
"ban_unk_token": {
"default": false,
"description": "Prevent unk token generation by setting unk probability to 0.",
"title": "Ban Unk Token",
"type": "boolean"
},
"phrase_table": {
"default": "",
"description": "If phrase_table is provided (with replace_unk), it will look up the identified source token and give the corresponding target token.",
"title": "Phrase Table",
"type": "string"
},
"n_best": {
"default": 1,
"description": "Output the n_best decoded sentences.",
"title": "N Best",
"type": "integer"
},
"dump_beam": {
"default": "",
"description": "File to dump beam information to.",
"title": "Dump Beam",
"type": "string"
},
"verbose": {
"default": false,
"description": "Print scores and predictions for each input.",
"title": "Verbose",
"type": "boolean"
},
"with_score": {
"default": false,
"description": "Add a tab separated score to each output.",
"title": "With Score",
"type": "boolean"
},
"timestamps": {
"default": "none",
"description": "Audio models only. Timestamp output: 'none' = plain text, 'segment' = JSON with segment times, 'word' = per-word times via cross-attention DTW.",
"enum": [
"none",
"segment",
"word"
],
"title": "Timestamps",
"type": "string"
},
"language": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Audio models only. Language code (e.g. 'en', 'fr'). Inserts the language token into the decoder prefix.",
"title": "Language"
},
"task": {
"anyOf": [
{
"enum": [
"transcribe",
"translate"
],
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Audio models only. 'transcribe' for same-language, 'translate' for translation to English.",
"title": "Task"
},
"initial_prompt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Audio models only. Text prompt to condition decoder output style and vocabulary. Prepended as previous context.",
"title": "Initial Prompt"
},
"condition_on_previous_text": {
"default": false,
"description": "Audio models only. Feed previous chunk's decoded text as decoder prompt for the next chunk.",
"title": "Condition On Previous Text",
"type": "boolean"
},
"fallback_temperatures": {
"default": [
0.0,
0.2,
0.4,
0.6,
0.8,
1.0
],
"description": "Audio models only. Temperature cascade for decoding fallback. First temperature uses beam search; subsequent use sampling. Set to [0.0] to disable fallback.",
"items": {
"type": "number"
},
"title": "Fallback Temperatures",
"type": "array"
},
"compression_ratio_threshold": {
"default": 2.4,
"description": "Audio models only. If gzip compression ratio of decoded text exceeds this, retry at next fallback temperature.",
"title": "Compression Ratio Threshold",
"type": "number"
},
"logprob_threshold": {
"default": -1.0,
"description": "Audio models only. If average log probability per token is below this, retry at next fallback temperature.",
"title": "Logprob Threshold",
"type": "number"
},
"no_speech_threshold": {
"default": 0.6,
"description": "Audio models only. Low avg_logprob only triggers fallback when no_speech_prob is also below this threshold.",
"maximum": 1.0,
"minimum": 0.0,
"title": "No Speech Threshold",
"type": "number"
},
"estim_only": {
"default": false,
"description": "Process the input to estimator only (no decoder).",
"title": "Estim Only",
"type": "boolean"
},
"attn_debug": {
"default": false,
"description": "Print best attn for each word.",
"title": "Attn Debug",
"type": "boolean"
},
"align_debug": {
"default": false,
"description": "Print best align for each word.",
"title": "Align Debug",
"type": "boolean"
}
},
"additionalProperties": false
}

field align_debug : bool = False

Print best align for each word.

field alpha : float = 1.0

Length penalty parameter (higher = longer generation)

field attn_debug : bool = False

Print best attn for each word.

field ban_unk_token : bool = False

Prevent unk token generation by setting unk probability to 0.

field beam_size : int = 5

Beam size.

field beta : float = -0.0

Coverage penalty parameter.

field block_ngram_repeat : int = 0

Block repetition of ngrams during decoding.

field compression_ratio_threshold : float = 2.4

Audio models only. If gzip compression ratio of decoded text exceeds this, retry at next fallback temperature.

field condition_on_previous_text : bool = False

Audio models only. Feed previous chunk’s decoded text as decoder prompt for the next chunk.

field coverage_penalty : Literal['none', 'wu', 'summary'] = 'none'

Coverage penalty to use. Only available in beam search.

field dump_beam : str = ''

File to dump beam information to.

field estim_only : bool = False

Process the input to estimator only (no decoder).

field fallback_temperatures : List[float] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

Audio models only. Temperature cascade for decoding fallback. First temperature uses beam search; subsequent use sampling. Set to [0.0] to disable fallback.

field ignore_when_blocking : List[str] = []

Ignore these strings when blocking repeats. You want to block sentence delimiters.

field initial_prompt : str | None = None

Audio models only. Text prompt to condition decoder output style and vocabulary. Prepended as previous context.

field language : str | None = None

Audio models only. Language code (e.g. ‘en’, ‘fr’). Inserts the language token into the decoder prefix.

field length_penalty : Literal['avg', 'wu', 'none'] = 'avg'

Length penalty to use.

field logprob_threshold : float = -1.0

Audio models only. If average log probability per token is below this, retry at next fallback temperature.

field max_length : int = 250

Maximum prediction length.

field max_length_ratio : float = 2

Maximum prediction length ratio. For European languages, 2 is large enough, for target Asian characters, need to increase to 2-3, for special languages (Burmese, Amharic) to 10. Set to 0 to disable ratio-based length capping.

  • Constraints:
    • ge = 0

field min_length : int = 0

Minimum prediction length.

  • Constraints:
    • ge = 0

field n_best : int = 1

Output the n_best decoded sentences.

field no_speech_threshold : float = 0.6

Audio models only. Low avg_logprob only triggers fallback when no_speech_prob is also below this threshold.

  • Constraints:
    • ge = 0.0
    • le = 1.0

field phrase_table : str = ''

If phrase_table is provided (with replace_unk), it will look up the identified source token and give the corresponding target token.

field ratio : float = -0.0

Ratio based beam stop condition.

field replace_unk : bool = False

Replace the generated UNK tokens with the source token that had the highest attention weight. If phrase_table is provided, it will look up the identified source token and give the corresponding target token. If it is not provided (or the identified source token does not exist in the table), then it will copy the source token.

field stepwise_penalty : bool = False

Apply coverage penalty at every decoding step. Helpful for summary penalty.

field task : Literal['transcribe', 'translate'] | None = None

Audio models only. ‘transcribe’ for same-language, ‘translate’ for translation to English.

field temperature : float = 1.0

If doing random sampling, divide the logits by this before computing softmax during decoding.

field timestamps : Literal['none', 'segment', 'word'] = 'none'

Audio models only. Timestamp output: ‘none’ = plain text, ‘segment’ = JSON with segment times, ‘word’ = per-word times via cross-attention DTW.

field top_k : int = 0

Set this to -1 to do random sampling from full distribution. Set this to value k>1 to do random sampling restricted to the k most likely next tokens. Set this to 1 to use argmax.

field top_p : float = 0.0

Probability for top-p/nucleus sampling. Restrict tokens to the most likely until the cumulated probability is over p. In range [0,1]. (https://arxiv.org/abs/1904.09751)

  • Constraints:
    • ge = 0.0

field verbose : bool = False

Print scores and predictions for each input.

field with_score : bool = False

Add a tab separated score to each output.

pydantic model eole.config.inference.InferenceConfig

Bases: RunningConfig, DecodingConfig, LoRaConfig, QuantizeConfig

Show JSON schema
{
"title": "InferenceConfig",
"type": "object",
"properties": {
"quant_layers": {
"default": [],
"description": "List of layers to be compressed in 4/8bit.",
"items": {
"type": "string"
},
"title": "Quant Layers",
"type": "array"
},
"quant_type": {
"default": "",
"description": "Type of compression.",
"enum": [
"",
"bnb_8bit",
"bnb_FP4",
"bnb_NF4",
"awq_gemm",
"awq_gemv",
"autoround",
"gguf"
],
"title": "Quant Type",
"type": "string"
},
"w_bit": {
"default": 4,
"description": "W_bit quantization",
"title": "W Bit",
"type": "integer"
},
"group_size": {
"default": 128,
"description": "Group size quantization.",
"title": "Group Size",
"type": "integer"
},
"autoround_packing_format": {
"default": "auto_round:auto_gptq",
"description": "AutoRound packing format (from quantization_config.packing_format). Determines whether qzeros use GPTQ-style (zeros-1) packing. Use 'auto_round:auto_gptq' for GPTQ-format (default), or 'auto_round' for direct zero-point.",
"title": "Autoround Packing Format",
"type": "string"
},
"autoround_sym": {
"default": true,
"description": "AutoRound symmetric quantization flag (from quantization_config.sym). Required to select the Marlin CUDA backend, which only supports symmetric quantization.",
"title": "Autoround Sym",
"type": "boolean"
},
"quant_exclude_modules": {
"default": [],
"description": "List of parent module names whose entire subtrees must not be quantized, even if child layers appear in quant_layers. Used for AutoRound models where some parent modules (e.g. shared_experts in MoE) were kept in fp16 during quantization.",
"items": {
"type": "string"
},
"title": "Quant Exclude Modules",
"type": "array"
},
"lora_layers": {
"default": [],
"description": "List of layers to be replaced by LoRa layers. E.g. ['linear_values', 'linear_query'] (\u00a74.2 in https://arxiv.org/abs/2106.09685)",
"items": {
"type": "string"
},
"title": "Lora Layers",
"type": "array"
},
"lora_embedding": {
"default": false,
"description": "Replace embeddings with LoRa Embeddings (\u00a75.1)",
"title": "Lora Embedding",
"type": "boolean"
},
"lora_rank": {
"default": 2,
"description": "r=2 successfully tested with NLLB-200 3.3B",
"title": "Lora Rank",
"type": "integer"
},
"lora_alpha": {
"default": 1,
"description": "\u00a74.1 https://arxiv.org/abs/2106.09685",
"title": "Lora Alpha",
"type": "integer"
},
"lora_dropout": {
"default": 0.0,
"description": "Rule of thumb: same value as in main model.",
"title": "Lora Dropout",
"type": "number"
},
"beam_size": {
"default": 5,
"description": "Beam size.",
"title": "Beam Size",
"type": "integer"
},
"ratio": {
"default": -0.0,
"description": "Ratio based beam stop condition.",
"title": "Ratio",
"type": "number"
},
"top_k": {
"default": 0,
"description": "Set this to -1 to do random sampling from full distribution. Set this to value k>1 to do random sampling restricted to the k most likely next tokens. Set this to 1 to use argmax.",
"title": "Top K",
"type": "integer"
},
"top_p": {
"default": 0.0,
"description": "Probability for top-p/nucleus sampling. Restrict tokens to the most likely until the cumulated probability is over p. In range [0,1]. (https://arxiv.org/abs/1904.09751)",
"maximum": 1.0,
"minimum": 0.0,
"title": "Top P",
"type": "number"
},
"temperature": {
"default": 1.0,
"description": "If doing random sampling, divide the logits by this before computing softmax during decoding.",
"title": "Temperature",
"type": "number"
},
"length_penalty": {
"default": "avg",
"description": "Length penalty to use.",
"enum": [
"avg",
"wu",
"none"
],
"title": "Length Penalty",
"type": "string"
},
"alpha": {
"default": 1.0,
"description": "Length penalty parameter (higher = longer generation)",
"title": "Alpha",
"type": "number"
},
"coverage_penalty": {
"default": "none",
"description": "Coverage penalty to use. Only available in beam search.",
"enum": [
"none",
"wu",
"summary"
],
"title": "Coverage Penalty",
"type": "string"
},
"beta": {
"default": -0.0,
"description": "Coverage penalty parameter.",
"title": "Beta",
"type": "number"
},
"stepwise_penalty": {
"default": false,
"description": "Apply coverage penalty at every decoding step. Helpful for summary penalty.",
"title": "Stepwise Penalty",
"type": "boolean"
},
"min_length": {
"default": 0,
"description": "Minimum prediction length.",
"minimum": 0,
"title": "Min Length",
"type": "integer"
},
"max_length": {
"default": 250,
"description": "Maximum prediction length.",
"title": "Max Length",
"type": "integer"
},
"max_length_ratio": {
"default": 2,
"description": "Maximum prediction length ratio. For European languages, 2 is large enough, for target Asian characters, need to increase to 2-3, for special languages (Burmese, Amharic) to 10. Set to 0 to disable ratio-based length capping.",
"minimum": 0,
"title": "Max Length Ratio",
"type": "number"
},
"block_ngram_repeat": {
"default": 0,
"description": "Block repetition of ngrams during decoding.",
"title": "Block Ngram Repeat",
"type": "integer"
},
"ignore_when_blocking": {
"default": [],
"description": "Ignore these strings when blocking repeats. You want to block sentence delimiters.",
"items": {
"type": "string"
},
"title": "Ignore When Blocking",
"type": "array"
},
"replace_unk": {
"default": false,
"description": "Replace the generated UNK tokens with the source token that had the highest attention weight. If phrase_table is provided, it will look up the identified source token and give the corresponding target token. If it is not provided (or the identified source token does not exist in the table), then it will copy the source token.",
"title": "Replace Unk",
"type": "boolean"
},
"ban_unk_token": {
"default": false,
"description": "Prevent unk token generation by setting unk probability to 0.",
"title": "Ban Unk Token",
"type": "boolean"
},
"phrase_table": {
"default": "",
"description": "If phrase_table is provided (with replace_unk), it will look up the identified source token and give the corresponding target token.",
"title": "Phrase Table",
"type": "string"
},
"n_best": {
"default": 1,
"description": "Output the n_best decoded sentences.",
"title": "N Best",
"type": "integer"
},
"dump_beam": {
"default": "",
"description": "File to dump beam information to.",
"title": "Dump Beam",
"type": "string"
},
"verbose": {
"default": false,
"description": "Print scores and predictions for each input.",
"title": "Verbose",
"type": "boolean"
},
"with_score": {
"default": false,
"description": "Add a tab separated score to each output.",
"title": "With Score",
"type": "boolean"
},
"timestamps": {
"default": "none",
"description": "Audio models only. Timestamp output: 'none' = plain text, 'segment' = JSON with segment times, 'word' = per-word times via cross-attention DTW.",
"enum": [
"none",
"segment",
"word"
],
"title": "Timestamps",
"type": "string"
},
"language": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Audio models only. Language code (e.g. 'en', 'fr'). Inserts the language token into the decoder prefix.",
"title": "Language"
},
"task": {
"anyOf": [
{
"enum": [
"transcribe",
"translate"
],
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Audio models only. 'transcribe' for same-language, 'translate' for translation to English.",
"title": "Task"
},
"initial_prompt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Audio models only. Text prompt to condition decoder output style and vocabulary. Prepended as previous context.",
"title": "Initial Prompt"
},
"condition_on_previous_text": {
"default": false,
"description": "Audio models only. Feed previous chunk's decoded text as decoder prompt for the next chunk.",
"title": "Condition On Previous Text",
"type": "boolean"
},
"fallback_temperatures": {
"default": [
0.0,
0.2,
0.4,
0.6,
0.8,
1.0
],
"description": "Audio models only. Temperature cascade for decoding fallback. First temperature uses beam search; subsequent use sampling. Set to [0.0] to disable fallback.",
"items": {
"type": "number"
},
"title": "Fallback Temperatures",
"type": "array"
},
"compression_ratio_threshold": {
"default": 2.4,
"description": "Audio models only. If gzip compression ratio of decoded text exceeds this, retry at next fallback temperature.",
"title": "Compression Ratio Threshold",
"type": "number"
},
"logprob_threshold": {
"default": -1.0,
"description": "Audio models only. If average log probability per token is below this, retry at next fallback temperature.",
"title": "Logprob Threshold",
"type": "number"
},
"no_speech_threshold": {
"default": 0.6,
"description": "Audio models only. Low avg_logprob only triggers fallback when no_speech_prob is also below this threshold.",
"maximum": 1.0,
"minimum": 0.0,
"title": "No Speech Threshold",
"type": "number"
},
"estim_only": {
"default": false,
"description": "Process the input to estimator only (no decoder).",
"title": "Estim Only",
"type": "boolean"
},
"attn_debug": {
"default": false,
"description": "Print best attn for each word.",
"title": "Attn Debug",
"type": "boolean"
},
"align_debug": {
"default": false,
"description": "Print best align for each word.",
"title": "Align Debug",
"type": "boolean"
},
"gpu_ranks": {
"default": [],
"description": "List of ranks for each process.",
"items": {
"type": "integer"
},
"title": "Gpu Ranks",
"type": "array"
},
"world_size": {
"default": 1,
"description": "Total number of distributed processes.",
"title": "World Size",
"type": "integer"
},
"parallel_mode": {
"default": "data_parallel",
"description": "Distributed mode.",
"enum": [
"data_parallel",
"tensor_parallel"
],
"title": "Parallel Mode",
"type": "string"
},
"gpu_backend": {
"default": "nccl",
"description": "Type of torch distributed backend.",
"title": "Gpu Backend",
"type": "string"
},
"gpu_verbose_level": {
"default": 0,
"description": "Gives more info on each process per GPU.",
"title": "Gpu Verbose Level",
"type": "integer"
},
"master_ip": {
"default": "localhost",
"description": "IP of master for torch.distributed training.",
"title": "Master Ip",
"type": "string"
},
"master_port": {
"default": 10000,
"description": "Port of master for torch.distributed training.",
"title": "Master Port",
"type": "integer"
},
"timeout": {
"default": 60,
"description": "Timeout for one GPU to wait for the others.",
"title": "Timeout",
"type": "integer"
},
"model_path": {
"default": "model",
"description": "Path to directory containing all model components.",
"title": "Model Path",
"type": "string"
},
"self_attn_backend": {
"default": "flash",
"description": "Self-attention backend.",
"enum": [
"flash",
"pytorch"
],
"title": "Self Attn Backend",
"type": "string"
},
"compute_dtype": {
"description": "Compute dtype (precision) to use for main compute. Some parameters might have other dtypes for specific cases (e.g. torch.amp -- See eole.config.training.TrainingConfig.storage_dtype) fp32 to force slow fp16 model on gtx1080, int8 to enable pytorch native 8-bit quantization (cpu only).",
"enum": [
"fp32",
"fp16",
"int8",
"bf16"
],
"title": "Compute Dtype",
"type": "string"
},
"torch_compile": {
"default": false,
"description": "Use torch.compile with dynamic=True.",
"title": "Torch Compile",
"type": "boolean"
},
"report_align": {
"default": false,
"description": "Report alignment for each translation.",
"title": "Report Align",
"type": "boolean"
},
"gold_align": {
"default": false,
"description": "Report alignment between source and gold target. Useful to test the performance of learnt alignments.",
"title": "Gold Align",
"type": "boolean"
},
"report_time": {
"default": false,
"description": "Report some translation time metrics.",
"title": "Report Time",
"type": "boolean"
},
"fuse_kvq": {
"default": false,
"description": "Fuse K, V, Q Linear layers into a single KVQ in Self Attn.",
"title": "Fuse Kvq",
"type": "boolean"
},
"fuse_gate": {
"default": false,
"description": "Fuse gate_up_proj and up_proj Linear layers into a single Linear.",
"title": "Fuse Gate",
"type": "boolean"
},
"profile": {
"default": false,
"description": "Report pytorch profiling stats.",
"title": "Profile",
"type": "boolean"
},
"batch_size": {
"default": 30,
"description": "Batch size.",
"title": "Batch Size",
"type": "integer"
},
"dynamic_shapes": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Use batch_size / Cache length static or Dynamic",
"title": "Dynamic Shapes"
},
"batch_type": {
"default": "sents",
"description": "Batch grouping for batch size.",
"enum": [
"sents",
"tokens"
],
"title": "Batch Type",
"type": "string"
},
"avg_raw_probs": {
"default": false,
"description": "If set, during ensembling scores from different models will be combined by averaging their raw probabilities and then taking the log. Otherwise, the log probabilities will be averaged directly. Necessary for models whose output layers can assign zero probability.",
"title": "Avg Raw Probs",
"type": "boolean"
},
"data_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "text",
"title": "Data Type"
},
"chat_template": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Chat Template"
},
"optional_eos": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"description": "Optional EOS tokens that would stop generation, e.g. <|eot_id|> for Llama3",
"title": "Optional Eos"
}
},
"additionalProperties": false
}

field avg_raw_probs : bool = False

If set, during ensembling scores from different models will be combined by averaging their raw probabilities and then taking the log. Otherwise, the log probabilities will be averaged directly. Necessary for models whose output layers can assign zero probability.

  • Validated by:
    • _validate_running_config

field batch_size : int = 30

Batch size.

  • Validated by:
    • _validate_running_config

field batch_type : Literal['sents', 'tokens'] = 'sents'

Batch grouping for batch size.

  • Validated by:
    • _validate_running_config

field chat_template : str | None = None

  • Validated by:
    • _validate_running_config

field data_type : str | None = 'text'

  • Validated by:
    • _validate_running_config

field dynamic_shapes : bool | None = None

Use batch_size / Cache length static or Dynamic

  • Validated by:
    • _validate_running_config

field fuse_gate : bool = False

Fuse gate_up_proj and up_proj Linear layers into a single Linear.

  • Validated by:
    • _validate_running_config

field fuse_kvq : bool = False

Fuse K, V, Q Linear layers into a single KVQ in Self Attn.

  • Validated by:
    • _validate_running_config

field gold_align : bool = False

Report alignment between source and gold target. Useful to test the performance of learnt alignments.

  • Validated by:
    • _validate_running_config

field optional_eos : List[str] | None = []

Optional EOS tokens that would stop generation, e.g. <|eot_id|> for Llama3

  • Validated by:
    • _validate_running_config

field profile : bool = False

Report pytorch profiling stats.

  • Validated by:
    • _validate_running_config

field report_align : bool = False

Report alignment for each translation.

  • Validated by:
    • _validate_running_config

field report_time : bool = False

Report some translation time metrics.

  • Validated by:
    • _validate_running_config

get_model_path()

property storage_dtype : dtype

Deduce which dtype to use for main model parameters.