Skip to main content

Models

Base Configs

pydantic model eole.config.models.BaseModelConfig

Bases: Config

Show JSON schema
{
"title": "BaseModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Architecture"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field adapter_bias : bool = False

Control whether or not the adapter module has bias weights.

field add_estimator : bool = False

Add estimator layer

field architecture : str | None = None

field decoder : TransformerDecoderConfig | RnnDecoderConfig | CnnDecoderConfig | None = None

Major parameters of a decoder.

field embeddings : EmbeddingsConfig [Optional]

Contains most of the args useful to build the Embeddings module.

field encoder : TransformerEncoderConfig | RnnEncoderConfig | CnnEncoderConfig | MeanEncoderConfig | VisionEncoderConfig | AudioEncoderConfig | None = None

Major parameters of an encoder.

field eole_version : str | None = '0.5.2'

Eole version used to convert/train/save the model.

field estimator_type : Literal['average', 'last_token', 'first_token'] = 'average'

Which hidden_states to use to feed the estimator

field generator_bias : bool = True

Control whether or not the generator Linear module has bias weights.

field generator_function : Literal['softmax', 'sparsemax'] = 'softmax'

Which function to use for generating probabilities over the target vocabulary.

field hidden_size : int = -1

Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.

field huggingface_model : str | None = None

Original huggingface model.

field input_feed : int = 1

Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.

field layers : int = -1

Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).

field left_pad : bool = False

Enable left-padding, useful for some LLMs.

field moe_transformer_ff : int | None = None

Size of hidden moe transformer feed-forward.

field projector_activation_fn : ActivationFunction | None = ActivationFunction.relu

The activation function to use in adapter projector layer.

field share_decoder_embeddings : bool = False

Use a share weight matrix for the input and output word embeddings in the decoder.

field share_embeddings : bool = False

Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.

field spatial_merge_size : int | None = 1

Control the presence and size of patch merger (Mistral3)

field transformer_ff : int = -1

Size of hidden transformer feed-forward.

field word_vec_size : int = -1

Word embedding size for src and tgt.

validator default_architecture » all fields

validator str_to_dict » decoder , embeddings , encoder

validator validate_merge_size » spatial_merge_size

update_model_opts()

property model_type : ModelType

pydantic model eole.config.models.EmbeddingsConfig

Bases: Config

Show JSON schema
{
"title": "EmbeddingsConfig",
"type": "object",
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"$defs": {
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
}
},
"additionalProperties": false
}

field freeze_word_vecs_dec : bool = False

Freeze word embeddings on the decoder side.

field freeze_word_vecs_enc : bool = False

Freeze word embeddings on the encoder side.

field n_positions : int | None = None

Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: max relative positions, in the case of position_encoding_type: Relative.

field normalize : bool | None = False

Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909

field position_encoding : bool = False

Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.

field position_encoding_type : PositionEncodingType | None = PositionEncodingType.SinusoidalInterleaved

Type of positional encoding.

field position_shift : int | None = 0

Position IDs shift applied before computing position embeddings; dirty patch to cover for xlm-roberta-xl.

field src_word_vec_size : int = 512

Word embedding size for src.

field tgt_word_vec_size : int = 512

Word embedding size for tgt.

field word_vec_size : int = -1

Word embedding size for src and tgt.

validator validate_embeddings » all fields

pydantic model eole.config.models.EncoderConfig

Bases: Config

Abstract class for all encoders

Show JSON schema
{
"title": "EncoderConfig",
"description": "Abstract class for all encoders",
"type": "object",
"properties": {
"encoder_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "rnn",
"description": "Type of encoder layer(s) to use.",
"title": "Encoder Type"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"additionalProperties": false
}

field encoder_type : str | None = 'rnn'

Type of encoder layer(s) to use.

field hidden_size : int = 512

Size of encoder hidden states.

field layers : int = 2

Number of layers in the encoder.

field src_word_vec_size : int = 512

Word embedding size for src.

property data_type : str

pydantic model eole.config.models.DecoderConfig

Bases: Config

Abstract class for all decoders

Show JSON schema
{
"title": "DecoderConfig",
"description": "Abstract class for all decoders",
"type": "object",
"properties": {
"decoder_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "rnn",
"description": "Type of decoder layer(s) to use.",
"title": "Decoder Type"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
}
},
"additionalProperties": false
}

field coverage_attn : bool = False

Train a coverage attention layer.

  • Validated by:
    • _validate_decoder_config

field decoder_type : str | None = 'rnn'

Type of decoder layer(s) to use.

  • Validated by:
    • _validate_decoder_config

field global_attention : Literal['dot', 'general', 'mlp', None] = 'general'

The attention type to use. (Luong=general, Bahdanau=MLP)

  • Validated by:
    • _validate_decoder_config

field global_attention_function : Literal['softmax', 'sparsemax'] = 'softmax'

Global attention function to use.

  • Validated by:
    • _validate_decoder_config

field hidden_size : int = 512

Size of decoder hidden states.

  • Validated by:
    • _validate_decoder_config

field lambda_coverage : float = 0.0

Lambda value for coverage loss of See et al (2017)

  • Validated by:
    • _validate_decoder_config

field layers : int = 2

Number of layers in the decoder.

  • Validated by:
    • _validate_decoder_config

field tgt_word_vec_size : int = 512

Word embedding size for tgt.

  • Validated by:
    • _validate_decoder_config

field with_cross_attn : bool = False

Decoder uses cross-attention with encoder outputs.

  • Validated by:
    • _validate_decoder_config

pydantic model eole.config.models.CustomModelConfig

Bases: TransformerConfig, BaseModelConfig

Wrap anything that does not fit a set common architecture.

Show JSON schema
{
"title": "CustomModelConfig",
"description": "Wrap anything that does not fit a set common architecture.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "custom",
"default": "custom",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B).",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B).",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Shift applied to position IDs before building position embeddings; dirty patch to cover for xlm-roberta-xl.",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral.",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B).",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:
  • Validators:

field architecture : Literal['custom'] = 'custom'

  • Validated by:

Transformer

pydantic model eole.config.models.TransformerConfig

Bases: Config

This base TransformerConfig class regroups parameters that can both be set at model level or at encoder/decoder level. BaseModelConfig._override_values validator overrides encoder/decoder values with model values if relevant.

Show JSON schema
{
"title": "TransformerConfig",
"description": "This base TransformerConfig class regroups parameters than can\nboth be set at model level or either encoder/decoder level.\nBaseModelConfig._override_values validator overrides\nencoder/decoder values with model values if relevant.",
"type": "object",
"properties": {
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}

field add_ffnbias : bool = False

Add bias to nn.Linear of MLP FFN.

  • Validated by:
    • _validate_transformer_config

field add_final_linear_bias : bool = False

Add bias to nn.Linear of final_linear in MHA.

  • Validated by:
    • _validate_transformer_config

field add_key_bias : bool | None = None

Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.

  • Validated by:
    • _validate_transformer_config

field add_qkvbias : bool = False

Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with add_final_linear_bias.

  • Validated by:
    • _validate_transformer_config

field attn_scaling : float | None = None

Attention scaling factor, when None uses 1/sqrt(head_dim) by default

  • Validated by:
    • _validate_transformer_config

field ffn_layernorm : bool = False

Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.

  • Validated by:
    • _validate_transformer_config

field first_k_dense_replace : int = 0

Number of layers using Dense instead of MoE

  • Validated by:
    • _validate_transformer_config

field head_dim : int | None = None

Head dimension when this needs to be different vs hidden_size // heads

  • Validated by:
    • _validate_transformer_config

field heads : int = 8

Number of heads for transformer self-attention.

  • Validated by:
    • _validate_transformer_config

field heads_kv : int | None = None

Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)

  • Validated by:
    • _validate_transformer_config

field interpolate_mode : str | None = None

Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., ‘bilinear’): position_embedding is a learned grid using interpolation. (see Vision.py encoder)

  • Validated by:
    • _validate_transformer_config

field key_norm : bool = False

  • Validated by:
    • _validate_transformer_config

field layer_norm : Literal['standard', 'standardFP32', 'rms', 'gemma-rms'] = 'standard'

Type of layer normalization in transformer architecture.

  • Validated by:
    • _validate_transformer_config

field mlp_activation_fn : ActivationFunction = ActivationFunction.relu

The activation function to use in MLP layer.

  • Validated by:
    • _validate_transformer_config

field moe_renormalize : bool = False

Qwen renormalize expert weights after softmax.

  • Validated by:
    • _validate_transformer_config

field moe_softmax_after : bool = False

Usually softmax is before topk, Mixtral does it after.

  • Validated by:
    • _validate_transformer_config

field moe_transformer_ff : int | None = None

Size of hidden moe transformer feed-forward.

  • Validated by:
    • _validate_transformer_config

field n_positions : int | None = None

Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.

  • Validated by:
    • _validate_transformer_config

field norm_eps : float = 1e-05

Layer norm epsilon.

  • Validated by:
    • _validate_transformer_config

field num_experts : int = 0

Number of experts for MoE models.

  • Validated by:
    • _validate_transformer_config

field num_experts_per_tok : int = 2

Number of experts per token.

  • Validated by:
    • _validate_transformer_config

field num_shared_experts : int = 0

Number of shared experts for MoE models (DeepSeekv2).

  • Validated by:
    • _validate_transformer_config

field parallel_residual : bool = False

Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.

  • Validated by:
    • _validate_transformer_config

field position_encoding_type : PositionEncodingType | None = PositionEncodingType.SinusoidalInterleaved

Type of positional encoding.

  • Validated by:
    • _validate_transformer_config

field q_gating : bool = False

Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).

  • Validated by:
    • _validate_transformer_config

field qk_norm_post_rope : bool = False

  • Validated by:
    • _validate_transformer_config

field query_norm : bool = False

  • Validated by:
    • _validate_transformer_config

field relative_positions_buckets : int = 0

Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).

  • Validated by:
    • _validate_transformer_config

field rope_config : RotaryPositionConfig | None = None

Rotary position config, if relevant.

  • Validated by:
    • _validate_transformer_config

field shared_expert_gate : bool = False

Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).

  • Validated by:
    • _validate_transformer_config

field shared_layer_norm : bool = False

Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.

  • Validated by:
    • _validate_transformer_config

field sliding_window : int = 0

Sliding window for transformer self-attention.

  • Validated by:
    • _validate_transformer_config

field transformer_ff : int = 2048

Size of hidden transformer feed-forward.

  • Validated by:
    • _validate_transformer_config

property dim_per_head : int

pydantic model eole.config.models.TransformerEncoderConfig

Bases: TransformerConfig, EncoderConfig

Show JSON schema
{
"title": "TransformerEncoderConfig",
"type": "object",
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:
  • Validators:

field encoder_type : Literal['transformer'] = 'transformer'

  • Validated by:
    • _validate_transformer_config

pydantic model eole.config.models.TransformerDecoderConfig

Bases: TransformerConfig, DecoderConfig

Show JSON schema
{
"title": "TransformerDecoderConfig",
"type": "object",
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}

field LM_type : Literal['causal', 'prefix'] = 'causal'

TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field aan_useffn : bool = False

Turn on the FFN layer in the AAN decoder.

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field alignment_heads : int = 0

Number of cross attention heads per layer to supervise with.

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field alignment_layer : int = -2

Layer number which has to be supervised.

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field decoder_type : Literal['transformer'] = 'transformer'

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field full_context_alignment : bool = False

Whether alignment is conditioned on full target context.

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field lambda_align : float = 0.0

Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field layer_types : List[str] | None = None

Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: ‘full_attention’, ‘linear_attention’. When None, all layers use full attention.

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field linear_conv_kernel_dim : int = 4

Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field linear_key_head_dim : int = 128

Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field linear_num_key_heads : int = 16

Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field linear_num_value_heads : int = 32

Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field linear_value_head_dim : int = 128

Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

pydantic model eole.config.models.TransformerModelConfig

Bases: TransformerConfig, BaseModelConfig

Facilitate setting some transformer specific params at model level.

Show JSON schema
{
"title": "TransformerModelConfig",
"description": "Facilitate setting some transformer specific params at model level.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "transformer",
"default": "transformer",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: maximum relative positions (position_encoding_type: Relative).",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: maximum relative positions (position_encoding_type: Relative).",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: maximum relative positions (position_encoding_type: Relative).",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Position IDs shift applied before computing position embeddings; dirty patch to cover for xlm-roberta-xl.",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['transformer'] = 'transformer'

validator default_architecture » all fields

validator encoder_decoder_type » all fields

pydantic model eole.config.models.TransformerLMModelConfig

Bases: TransformerConfig, BaseModelConfig

Facilitate setting some transformer-specific params at model level.

Show JSON schema
{
"title": "TransformerLMModelConfig",
"description": "Facilitate setting some transformer-specific params at model level.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder",
"type": "null"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "transformer_lm",
"default": "transformer_lm",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['transformer_lm'] = 'transformer_lm'

field encoder : None = None

Major parameters of an encoder.

validator default_architecture » all fields

validator encoder_decoder_type » all fields

pydantic model eole.config.models.TransformerEncoderModelConfig

Bases: TransformerConfig, BaseModelConfig

Facilitate setting some transformer specific params at model level.

Show JSON schema
{
"title": "TransformerEncoderModelConfig",
"description": "Facilitate setting some transformer specific params at model level.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder",
"type": "null"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "transformer_encoder",
"default": "transformer_encoder",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['transformer_encoder'] = 'transformer_encoder'

field decoder : None = None

Major parameters of a decoder.

validator default_architecture » all fields

validator encoder_decoder_type » all fields

pydantic model eole.config.models.VisionTransformerLMModelConfig

Bases: TransformerConfig, BaseModelConfig

Show JSON schema
{
"title": "VisionTransformerLMModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "vision_transformer_lm",
"default": "vision_transformer_lm",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"adapter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "llava",
"description": "Adapter type to use in the model.",
"title": "Adapter"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Position IDs shift applied before computing position embeddings; dirty patch to cover for xlm-roberta-xl.",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalizes expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalizes expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field adapter : str | None = 'llava'

Adapter type to use in the model.

field architecture : Literal['vision_transformer_lm'] = 'vision_transformer_lm'

validator default_architecture » all fields

validator encoder_decoder_type » all fields

property image_size

property patch_size

pydantic model eole.config.models.WhisperModelConfig

Bases: TransformerConfig, BaseModelConfig

Configuration for Whisper speech-to-text models.

Show JSON schema
{
"title": "WhisperModelConfig",
"description": "Configuration for Whisper speech-to-text models.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "whisper",
"default": "whisper",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalizes expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"suppress_tokens": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "List of token IDs to suppress during audio decoding.",
"title": "Suppress Tokens"
},
"begin_suppress_tokens": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "List of token IDs to suppress at the first generated position.",
"title": "Begin Suppress Tokens"
},
"no_timestamps_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Token ID for the no-timestamps token in audio models.",
"title": "No Timestamps Token Id"
},
"word_timestamp_heads": {
"anyOf": [
{
"items": {
"items": {
"type": "integer"
},
"type": "array"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "List of [layer, head] pairs for word-level timestamp extraction (mapped from alignment_heads in HF generation_config).",
"title": "Word Timestamp Heads"
},
"median_filter_width": {
"default": 7,
"description": "Median filter width for word-level timestamp smoothing.",
"title": "Median Filter Width",
"type": "integer"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Position IDs shift applied before building position embeddings. Dirty patch to cover for xlm-roberta-xl.",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['whisper'] = 'whisper'

field begin_suppress_tokens : List[int] | None = None

List of token IDs to suppress at the first generated position.

field median_filter_width : int = 7

Median filter width for word-level timestamp smoothing.

field no_timestamps_token_id : int | None = None

Token ID for the no-timestamps token in audio models.

field suppress_tokens : List[int] | None = None

List of token IDs to suppress during audio decoding.

field word_timestamp_heads : List[List[int]] | None = None

List of [layer, head] pairs for word-level timestamp extraction (mapped from alignment_heads in HF generation_config).

validator default_architecture » all fields

validator encoder_decoder_type » all fields

Vision / Audio Encoders

pydantic model eole.config.models.VisionEncoderConfig

Bases: TransformerConfig, EncoderConfig

Based on mistral-community/pixtral-12b, might evolve later.

Show JSON schema
{
"title": "VisionEncoderConfig",
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"type": "object",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}

field encoder_sam : bool = False

  • Validated by:
    • _validate_transformer_config

field encoder_type : Literal['vision'] = 'vision'

  • Validated by:
    • _validate_transformer_config

field image_size : int | None = 1024

  • Validated by:
    • _validate_transformer_config

field image_token_id : int | None = 10

  • Validated by:
    • _validate_transformer_config

field image_token_id_list : List[int] | None = None

includes other image_token ids

  • Validated by:
    • _validate_transformer_config

field layernorm_post : bool = False

  • Validated by:
    • _validate_transformer_config

field layernorm_pre : bool = True

  • Validated by:
    • _validate_transformer_config

field mm_tokens_per_image : int | None = 256

  • Validated by:
    • _validate_transformer_config

field num_channels : int | None = 3

  • Validated by:
    • _validate_transformer_config

field num_position_embeddings : int | None = None

Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48×48). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.

  • Validated by:
    • _validate_transformer_config

field patch_conv_bias : bool = False

  • Validated by:
    • _validate_transformer_config

field patch_size : int | None = 16

  • Validated by:
    • _validate_transformer_config

field temporal_patch_size : int = 1

Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).

  • Validated by:
    • _validate_transformer_config

field use_class_embedding : bool = False

  • Validated by:
    • _validate_transformer_config

property data_type : str

pydantic model eole.config.models.AudioEncoderConfig

Bases: TransformerConfig, EncoderConfig

Configuration for audio encoder.

Show JSON schema
{
"title": "AudioEncoderConfig",
"description": "Configuration for audio encoder.",
"type": "object",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}

field chunk_length : int = 30

Audio chunk length in seconds.

  • Validated by:
    • _validate_transformer_config

field encoder_type : Literal['audio'] = 'audio'

  • Validated by:
    • _validate_transformer_config

field hop_length : int = 160

Hop length for mel spectrogram.

  • Validated by:
    • _validate_transformer_config

field max_source_positions : int = 1500

Maximum number of source positions (time frames after conv stem).

  • Validated by:
    • _validate_transformer_config

field n_fft : int = 400

FFT window size for mel spectrogram.

  • Validated by:
    • _validate_transformer_config

field num_mel_bins : int = 80

Number of mel spectrogram bins.

  • Validated by:
    • _validate_transformer_config

field position_encoding_type : PositionEncodingType | None = None

  • Validated by:
    • _validate_transformer_config

field sample_rate : int = 16000

Audio sample rate in Hz.

  • Validated by:
    • _validate_transformer_config

field timestamp_resolution : float = 0.02

Time resolution per timestamp token in seconds.

  • Validated by:
    • _validate_transformer_config

property data_type : str

RNN

pydantic model eole.config.models.RnnConfig

Bases: Config

Just to facilitate testing discriminator stuff.

Show JSON schema
{
"title": "RnnConfig",
"description": "Just to facilitate testing discriminator stuff.",
"type": "object",
"properties": {
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"additionalProperties": false
}

field bridge : bool = False

Have an additional layer between the last encoder state and the first decoder state (RNN specific).

field rnn_type : Literal['LSTM', 'GRU'] = 'LSTM'

The gate type to use in the RNNs.

pydantic model eole.config.models.RnnEncoderConfig

Bases: RnnConfig, EncoderConfig

Show JSON schema
{
"title": "RnnEncoderConfig",
"type": "object",
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field encoder_type : Literal['rnn', 'brnn'] = 'rnn'

pydantic model eole.config.models.RnnDecoderConfig

Bases: RnnConfig, DecoderConfig

Show JSON schema
{
"title": "RnnDecoderConfig",
"type": "object",
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"additionalProperties": false
}

field bidirectional_encoder : bool | None = False

  • Validated by:
    • _validate_decoder_config

field context_gate : Literal['source', 'target', 'both', None] = None

Type of context gate to use.

  • Validated by:
    • _validate_decoder_config

field decoder_type : Literal['rnn'] = 'rnn'

  • Validated by:
    • _validate_decoder_config

pydantic model eole.config.models.RnnModelConfig

Bases: RnnConfig, BaseModelConfig

Show JSON schema
{
"title": "RnnModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "rnn",
"default": "rnn",
"title": "Architecture",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Maximum relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Maximum relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Maximum relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['rnn'] = 'rnn'

validator default_architecture » all fields

validator encoder_decoder_type » all fields

CNN

pydantic model eole.config.models.CnnConfig

Bases: Config

Show JSON schema
{
"title": "CnnConfig",
"type": "object",
"properties": {
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field cnn_kernel_width : int = 3

Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.

pydantic model eole.config.models.CnnEncoderConfig

Bases: CnnConfig, EncoderConfig

Show JSON schema
{
"title": "CnnEncoderConfig",
"type": "object",
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field encoder_type : Literal['cnn'] = 'cnn'

pydantic model eole.config.models.CnnDecoderConfig

Bases: CnnConfig, DecoderConfig

Show JSON schema
{
"title": "CnnDecoderConfig",
"type": "object",
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:
  • Validators:

field decoder_type : Literal['cnn'] = 'cnn'

  • Validated by:
    • _validate_decoder_config

pydantic model eole.config.models.CnnModelConfig

Bases: CnnConfig, BaseModelConfig

Show JSON schema
{
"title": "CnnModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "cnn",
"default": "cnn",
"title": "Architecture",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Position IDs shift applied before building position embeddings; dirty patch to cover for xlm-roberta-xl.",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B).",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: maximum relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B).",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: maximum relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1, an nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['cnn'] = 'cnn'

validator default_architecture » all fields

validator encoder_decoder_type » all fields