Models
Base Configs
pydantic model eole.config.models.BaseModelConfig
Bases: Config
Show JSON schema
{
"title": "BaseModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Architecture"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Position IDs shift applied before building position embeddings; dirty patch to cover for xlm-roberta-xl.",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
adapter_bias (bool), add_estimator (bool), architecture (str | None), decoder (eole.config.models.TransformerDecoderConfig | eole.config.models.RnnDecoderConfig | eole.config.models.CnnDecoderConfig | None), embeddings (eole.config.models.EmbeddingsConfig), encoder (eole.config.models.TransformerEncoderConfig | eole.config.models.RnnEncoderConfig | eole.config.models.CnnEncoderConfig | eole.config.models.MeanEncoderConfig | eole.config.models.VisionEncoderConfig | eole.config.models.AudioEncoderConfig | None), eole_version (str | None), estimator_type (Literal['average', 'last_token', 'first_token']), generator_bias (bool), generator_function (Literal['softmax', 'sparsemax']), hidden_size (int), huggingface_model (str | None), input_feed (int), layers (int), left_pad (bool), moe_transformer_ff (int | None), projector_activation_fn (eole.constants.ActivationFunction | None), share_decoder_embeddings (bool), share_embeddings (bool), spatial_merge_size (int | None), transformer_ff (int), word_vec_size (int)
- Validators:
_override_values » all fields, _validate_model_config » all fields, default_architecture » all fields, str_to_dict » decoder, str_to_dict » embeddings, str_to_dict » encoder, validate_merge_size » spatial_merge_size
field adapter_bias : bool = False
Control whether or not the adapter module has bias weights.
- Validated by:
_override_values_validate_model_configdefault_architecture
field add_estimator : bool = False
Add estimator layer
- Validated by:
_override_values_validate_model_configdefault_architecture
field architecture : str | None = None
- Validated by:
_override_values_validate_model_configdefault_architecture
field decoder : TransformerDecoderConfig | RnnDecoderConfig | CnnDecoderConfig | None = None
Major parameters of a decoder.
- Validated by:
_override_values_validate_model_configdefault_architecturestr_to_dict
field embeddings : EmbeddingsConfig [Optional]
Contains most of the args useful to build the Embeddings module.
- Validated by:
_override_values_validate_model_configdefault_architecturestr_to_dict
field encoder : TransformerEncoderConfig | RnnEncoderConfig | CnnEncoderConfig | MeanEncoderConfig | VisionEncoderConfig | AudioEncoderConfig | None = None
Major parameters of an encoder.
- Validated by:
_override_values_validate_model_configdefault_architecturestr_to_dict
field eole_version : str | None = '0.5.2'
Eole version used to convert/train/save the model.
- Validated by:
_override_values_validate_model_configdefault_architecture
field estimator_type : Literal['average', 'last_token', 'first_token'] = 'average'
Which hidden_states to use to feed the estimator
- Validated by:
_override_values_validate_model_configdefault_architecture
field generator_bias : bool = True
Control whether or not the generator Linear module has bias weights.
- Validated by:
_override_values_validate_model_configdefault_architecture
field generator_function : Literal['softmax', 'sparsemax'] = 'softmax'
Which function to use for generating probabilities over the target vocabulary.
- Validated by:
_override_values_validate_model_configdefault_architecture
field hidden_size : int = -1
Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.
- Validated by:
_override_values_validate_model_configdefault_architecture
field huggingface_model : str | None = None
Original huggingface model.
- Validated by:
_override_values_validate_model_configdefault_architecture
field input_feed : int = 1
Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.
- Validated by:
_override_values_validate_model_configdefault_architecture
field layers : int = -1
Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).
- Validated by:
_override_values_validate_model_configdefault_architecture
field left_pad : bool = False
Enable left-padding, useful for some LLMs.
- Validated by:
_override_values_validate_model_configdefault_architecture
field moe_transformer_ff : int | None = None
Size of hidden moe transformer feed-forward.
- Validated by:
_override_values_validate_model_configdefault_architecture
field projector_activation_fn : ActivationFunction | None = ActivationFunction.relu
The activation function to use in adapter projector layer.
- Validated by:
_override_values_validate_model_configdefault_architecture
field share_decoder_embeddings : bool = False
Use a share weight matrix for the input and output word embeddings in the decoder.
- Validated by:
_override_values_validate_model_configdefault_architecture
field share_embeddings : bool = False
Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.
- Validated by:
_override_values_validate_model_configdefault_architecture
field spatial_merge_size : int | None = 1
Control the presence and size of patch merger (Mistral3)
- Validated by:
_override_values_validate_model_configdefault_architecturevalidate_merge_size
field transformer_ff : int = -1
Size of hidden transformer feed-forward.
- Validated by:
_override_values_validate_model_configdefault_architecture
field word_vec_size : int = -1
Word embedding size for src and tgt.
- Validated by:
_override_values_validate_model_configdefault_architecture
validator default_architecture » all fields
validator str_to_dict » decoder , embeddings , encoder
validator validate_merge_size » spatial_merge_size
update_model_opts()
property model_type : ModelType
pydantic model eole.config.models.EmbeddingsConfig
Bases: Config
Show JSON schema
{
"title": "EmbeddingsConfig",
"type": "object",
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Position IDs shift applied before computing position embeddings; dirty patch to cover for xlm-roberta-xl.",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"$defs": {
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
validate_embeddings»all fields
field freeze_word_vecs_dec : bool = False
Freeze word embeddings on the decoder side.
- Validated by:
field freeze_word_vecs_enc : bool = False
Freeze word embeddings on the encoder side.
- Validated by:
field n_positions : int | None = None
Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.
- Validated by:
field normalize : bool | None = False
Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909
- Validated by:
field position_encoding : bool = False
Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.
- Validated by:
field position_encoding_type : PositionEncodingType | None = PositionEncodingType.SinusoidalInterleaved
Type of positional encoding.
- Validated by:
field position_shift : int | None = 0
Position IDs shift applied before computing position embeddings; dirty patch to cover for xlm-roberta-xl.
- Validated by:
field src_word_vec_size : int = 512
Word embedding size for src.
- Validated by:
field tgt_word_vec_size : int = 512
Word embedding size for tgt.
- Validated by:
field word_vec_size : int = -1
Word embedding size for src and tgt.
- Validated by:
validator validate_embeddings » all fields
pydantic model eole.config.models.EncoderConfig
Bases: Config
Abstract class for all encoders
Show JSON schema
{
"title": "EncoderConfig",
"description": "Abstract class for all encoders",
"type": "object",
"properties": {
"encoder_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "rnn",
"description": "Type of encoder layer(s) to use.",
"title": "Encoder Type"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
field encoder_type : str | None = 'rnn'
Type of encoder layer(s) to use.
field hidden_size : int = 512
Size of encoder hidden states.
field layers : int = 2
Number of layers in the encoder.
field src_word_vec_size : int = 512
Word embedding size for src.
property data_type : str
pydantic model eole.config.models.DecoderConfig
Bases: Config
Abstract class for all decoders
Show JSON schema
{
"title": "DecoderConfig",
"description": "Abstract class for all decoders",
"type": "object",
"properties": {
"decoder_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "rnn",
"description": "Type of decoder layer(s) to use.",
"title": "Decoder Type"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
_validate_decoder_config»all fields
field coverage_attn : bool = False
Train a coverage attention layer.
- Validated by:
_validate_decoder_config
field decoder_type : str | None = 'rnn'
Type of decoder layer(s) to use.
- Validated by:
_validate_decoder_config
field global_attention : Literal['dot', 'general', 'mlp', None] = 'general'
The attention type to use. (Luong=general, Bahdanau=MLP)
- Validated by:
_validate_decoder_config
field global_attention_function : Literal['softmax', 'sparsemax'] = 'softmax'
Global attention function to use.
- Validated by:
_validate_decoder_config
field hidden_size : int = 512
Size of decoder hidden states.
- Validated by:
_validate_decoder_config
field lambda_coverage : float = 0.0
Lambda value for coverage loss of See et al (2017)
- Validated by:
_validate_decoder_config
field layers : int = 2
Number of layers in the decoder.
- Validated by:
_validate_decoder_config
field tgt_word_vec_size : int = 512
Word embedding size for tgt.
- Validated by:
_validate_decoder_config
field with_cross_attn : bool = False
Decoder uses cross-attention with encoder outputs.
- Validated by:
_validate_decoder_config
pydantic model eole.config.models.CustomModelConfig
Bases: TransformerConfig, BaseModelConfig
Wrap anything that does not fit a set common architecture.
Show JSON schema
{
"title": "CustomModelConfig",
"description": "Wrap anything that does not fit a set common architecture.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "custom",
"default": "custom",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Shift applied to position IDs before computing position embeddings; dirty patch to cover for xlm-roberta-xl.",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
field architecture : Literal['custom'] = 'custom'
- Validated by:
_override_values » _validate_model_config » _validate_transformer_config » default_architecture
Transformer
pydantic model eole.config.models.TransformerConfig
Bases: Config
This base TransformerConfig class regroups parameters that can both be set at model level or either encoder/decoder level. BaseModelConfig._override_values validator overrides encoder/decoder values with model values if relevant.
Show JSON schema
{
"title": "TransformerConfig",
"description": "This base TransformerConfig class regroups parameters that can\nboth be set at model level or either encoder/decoder level.\nBaseModelConfig._override_values validator overrides\nencoder/decoder values with model values if relevant.",
"type": "object",
"properties": {
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
add_ffnbias (bool)add_final_linear_bias (bool)add_key_bias (bool | None)add_qkvbias (bool)attn_scaling (float | None)ffn_layernorm (bool)first_k_dense_replace (int)head_dim (int | None)heads (int)heads_kv (int | None)interpolate_mode (str | None)key_norm (bool)layer_norm (Literal['standard', 'standardFP32', 'rms', 'gemma-rms'])mlp_activation_fn (eole.constants.ActivationFunction)moe_renormalize (bool)moe_softmax_after (bool)moe_transformer_ff (int | None)n_positions (int | None)norm_eps (float)num_experts (int)num_experts_per_tok (int)num_shared_experts (int)parallel_residual (bool)position_encoding_type (eole.constants.PositionEncodingType | None)q_gating (bool)qk_norm_post_rope (bool)query_norm (bool)relative_positions_buckets (int)rope_config (eole.config.models.RotaryPositionConfig | None)shared_expert_gate (bool)shared_layer_norm (bool)sliding_window (int)transformer_ff (int)
- Validators:
_validate_transformer_config»all fields
field add_ffnbias : bool = False
Add bias to nn.Linear of MLP FFN.
- Validated by:
_validate_transformer_config
field add_final_linear_bias : bool = False
Add bias to nn.Linear of final_linear in MHA.
- Validated by:
_validate_transformer_config
field add_key_bias : bool | None = None
Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.
- Validated by:
_validate_transformer_config
field add_qkvbias : bool = False
Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with add_final_linear_bias.
- Validated by:
_validate_transformer_config
field attn_scaling : float | None = None
Attention scaling factor, when None uses 1/sqrt(head_dim) by default
- Validated by:
_validate_transformer_config
field ffn_layernorm : bool = False
Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.
- Validated by:
_validate_transformer_config
field first_k_dense_replace : int = 0
Number of layers using Dense instead of MoE
- Validated by:
_validate_transformer_config
field head_dim : int | None = None
Head dimension when this needs to be different vs hidden_size // heads
- Validated by:
_validate_transformer_config
field heads : int = 8
Number of heads for transformer self-attention.
- Validated by:
_validate_transformer_config
field heads_kv : int | None = None
Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)
- Validated by:
_validate_transformer_config
field interpolate_mode : str | None = None
Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., ‘bilinear’): position_embedding is a learned grid using interpolation. (see Vision.py encoder)
- Validated by:
_validate_transformer_config
field key_norm : bool = False
- Validated by:
_validate_transformer_config
field layer_norm : Literal['standard', 'standardFP32', 'rms', 'gemma-rms'] = 'standard'
Type of layer normalization in transformer architecture.
- Validated by:
_validate_transformer_config
field mlp_activation_fn : ActivationFunction = ActivationFunction.relu
The activation function to use in MLP layer.
- Validated by:
_validate_transformer_config
field moe_renormalize : bool = False
Qwen renormalize expert weights after softmax.
- Validated by:
_validate_transformer_config
field moe_softmax_after : bool = False
Usually softmax is before topk, Mixtral does it after.
- Validated by:
_validate_transformer_config
field moe_transformer_ff : int | None = None
Size of hidden moe transformer feed-forward.
- Validated by:
_validate_transformer_config
field n_positions : int | None = None
Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: maximum relative positions, when position_encoding_type: Relative.
- Validated by:
_validate_transformer_config
field norm_eps : float = 1e-05
Layer norm epsilon.
- Validated by:
_validate_transformer_config
field num_experts : int = 0
Number of experts for MoE models.
- Validated by:
_validate_transformer_config
field num_experts_per_tok : int = 2
Number of experts per token.
- Validated by:
_validate_transformer_config
field num_shared_experts : int = 0
Number of shared experts for MoE models (DeepSeekv2).
- Validated by:
_validate_transformer_config
field parallel_residual : bool = False
Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.
- Validated by:
_validate_transformer_config
field position_encoding_type : PositionEncodingType | None = PositionEncodingType.SinusoidalInterleaved
Type of positional encoding.
- Validated by:
_validate_transformer_config
field q_gating : bool = False
Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).
- Validated by:
_validate_transformer_config
field qk_norm_post_rope : bool = False
- Validated by:
_validate_transformer_config
field query_norm : bool = False
- Validated by:
_validate_transformer_config
field relative_positions_buckets : int = 0
Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).
- Validated by:
_validate_transformer_config
field rope_config : RotaryPositionConfig | None = None
Rotary position config, if relevant.
- Validated by:
_validate_transformer_config
field shared_expert_gate : bool = False
Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).
- Validated by:
_validate_transformer_config
field shared_layer_norm : bool = False
Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.
- Validated by:
_validate_transformer_config
field sliding_window : int = 0
Sliding window for transformer self-attention.
- Validated by:
_validate_transformer_config
field transformer_ff : int = 2048
Size of hidden transformer feed-forward.
- Validated by:
_validate_transformer_config
property dim_per_head : int
pydantic model eole.config.models.TransformerEncoderConfig
Bases: TransformerConfig, EncoderConfig
Show JSON schema
{
"title": "TransformerEncoderConfig",
"type": "object",
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: maximum relative positions, when position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
field encoder_type : Literal['transformer'] = 'transformer'
- Validated by:
_validate_transformer_config
pydantic model eole.config.models.TransformerDecoderConfig
Bases: TransformerConfig, DecoderConfig
Show JSON schema
{
"title": "TransformerDecoderConfig",
"type": "object",
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: maximum relative positions, when position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
LM_type (Literal['causal', 'prefix'])aan_useffn (bool)alignment_heads (int)alignment_layer (int)decoder_type (Literal['transformer'])full_context_alignment (bool)lambda_align (float)layer_types (List[str] | None)linear_conv_kernel_dim (int)linear_key_head_dim (int)linear_num_key_heads (int)linear_num_value_heads (int)linear_value_head_dim (int)
- Validators:
_validate_transformer_decoder_config»all fields
field LM_type : Literal['causal', 'prefix'] = 'causal'
TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field aan_useffn : bool = False
Turn on the FFN layer in the AAN decoder.
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field alignment_heads : int = 0
Number of cross attention heads per layer to supervise with.
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field alignment_layer : int = -2
Layer number which has to be supervised.
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field decoder_type : Literal['transformer'] = 'transformer'
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field full_context_alignment : bool = False
Whether alignment is conditioned on full target context.
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field lambda_align : float = 0.0
Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field layer_types : List[str] | None = None
Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: ‘full_attention’, ‘linear_attention’. When None, all layers use full attention.
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field linear_conv_kernel_dim : int = 4
Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field linear_key_head_dim : int = 128
Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field linear_num_key_heads : int = 16
Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field linear_num_value_heads : int = 32
Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
field linear_value_head_dim : int = 128
Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).
- Validated by:
_validate_decoder_config_validate_transformer_config_validate_transformer_decoder_config
pydantic model eole.config.models.TransformerModelConfig
Bases: TransformerConfig, BaseModelConfig
Facilitate setting some transformer specific params at model level.
Show JSON schema
{
"title": "TransformerModelConfig",
"description": "Facilitate setting some transformer specific params at model level.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "transformer",
"default": "transformer",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
_validate_transformer » all fields · default_architecture » all fields · encoder_decoder_type » all fields
field architecture : Literal['transformer'] = 'transformer'
- Validated by:
_override_values · _validate_model_config · _validate_transformer · _validate_transformer_config · default_architecture · encoder_decoder_type
validator default_architecture » all fields
validator encoder_decoder_type » all fields
pydantic model eole.config.models.TransformerLMModelConfig
Bases: TransformerConfig, BaseModelConfig
Facilitate setting some transformer specific params at model level.
Show JSON schema
{
"title": "TransformerLMModelConfig",
"description": "Facilitate setting some transformer specific params at model level.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder",
"type": "null"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "transformer_lm",
"default": "transformer_lm",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
_validate_transformer » all fields · default_architecture » all fields · encoder_decoder_type » all fields · str_to_dict » encoder
field architecture : Literal['transformer_lm'] = 'transformer_lm'
- Validated by:
_override_values · _validate_model_config · _validate_transformer · _validate_transformer_config · default_architecture · encoder_decoder_type
field encoder : None = None
Major parameters of an encoder.
- Validated by:
_override_values · _validate_model_config · _validate_transformer · _validate_transformer_config · default_architecture · encoder_decoder_type · str_to_dict
validator default_architecture » all fields
validator encoder_decoder_type » all fields
pydantic model eole.config.models.TransformerEncoderModelConfig
Bases: TransformerConfig, BaseModelConfig
Facilitate setting some transformer specific params at model level.
Show JSON schema
{
"title": "TransformerEncoderModelConfig",
"description": "Facilitate setting some transformer specific params at model level.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder",
"type": "null"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "transformer_encoder",
"default": "transformer_encoder",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
_validate_transformer » all fields, default_architecture » all fields, encoder_decoder_type » all fields, str_to_dict » decoder
field architecture : Literal['transformer_encoder'] = 'transformer_encoder'
- Validated by:
_override_values, _validate_model_config, _validate_transformer, _validate_transformer_config, default_architecture, encoder_decoder_type
field decoder : None = None
Major parameters of a decoder.
- Validated by:
_override_values, _validate_model_config, _validate_transformer, _validate_transformer_config, default_architecture, encoder_decoder_type, str_to_dict
validator default_architecture » all fields
validator encoder_decoder_type » all fields
pydantic model eole.config.models.VisionTransformerLMModelConfig
Bases: TransformerConfig, BaseModelConfig
Show JSON schema
{
"title": "VisionTransformerLMModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "vision_transformer_lm",
"default": "vision_transformer_lm",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
      "description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"adapter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "llava",
"description": "Adapter type to use in the model.",
"title": "Adapter"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
          "default": null,
          "description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
          "description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
          "description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
          "description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
_validate_vision_transformer » all fields, default_architecture » all fields, encoder_decoder_type » all fields
field adapter : str | None = 'llava'
Adapter type to use in the model.
- Validated by:
_override_values, _validate_model_config, _validate_transformer_config, _validate_vision_transformer, default_architecture, encoder_decoder_type
field architecture : Literal['vision_transformer_lm'] = 'vision_transformer_lm'
- Validated by:
_override_values, _validate_model_config, _validate_transformer_config, _validate_vision_transformer, default_architecture, encoder_decoder_type
validator default_architecture » all fields
validator encoder_decoder_type » all fields
property image_size
property patch_size
pydantic model eole.config.models.WhisperModelConfig
Bases: TransformerConfig, BaseModelConfig
Configuration for Whisper speech-to-text models.
Show JSON schema
{
"title": "WhisperModelConfig",
"description": "Configuration for Whisper speech-to-text models.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "whisper",
"default": "whisper",
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"suppress_tokens": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "List of token IDs to suppress during audio decoding.",
"title": "Suppress Tokens"
},
"begin_suppress_tokens": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "List of token IDs to suppress at the first generated position.",
"title": "Begin Suppress Tokens"
},
"no_timestamps_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Token ID for the no-timestamps token in audio models.",
"title": "No Timestamps Token Id"
},
"word_timestamp_heads": {
"anyOf": [
{
"items": {
"items": {
"type": "integer"
},
"type": "array"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "List of [layer, head] pairs for word-level timestamp extraction (mapped from alignment_heads in HF generation_config).",
"title": "Word Timestamp Heads"
},
"median_filter_width": {
"default": 7,
"description": "Median filter width for word-level timestamp smoothing.",
"title": "Median Filter Width",
"type": "integer"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Position IDs shift applied before computing position embeddings; dirty patch to cover for xlm-roberta-xl.",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV (e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
default_architecture » all fields, encoder_decoder_type » all fields
field architecture : Literal['whisper'] = 'whisper'
- Validated by:
_override_values, _validate_model_config, _validate_transformer_config, default_architecture, encoder_decoder_type
field begin_suppress_tokens : List[int] | None = None
List of token IDs to suppress at the first generated position.
- Validated by:
_override_values, _validate_model_config, _validate_transformer_config, default_architecture, encoder_decoder_type
field median_filter_width : int = 7
Median filter width for word-level timestamp smoothing.
- Validated by:
_override_values, _validate_model_config, _validate_transformer_config, default_architecture, encoder_decoder_type
field no_timestamps_token_id : int | None = None
Token ID for the no-timestamps token in audio models.
- Validated by:
_override_values, _validate_model_config, _validate_transformer_config, default_architecture, encoder_decoder_type
field suppress_tokens : List[int] | None = None
List of token IDs to suppress during audio decoding.
- Validated by:
_override_values, _validate_model_config, _validate_transformer_config, default_architecture, encoder_decoder_type
field word_timestamp_heads : List[List[int]] | None = None
List of [layer, head] pairs for word-level timestamp extraction (mapped from alignment_heads in HF generation_config).
- Validated by:
_override_values, _validate_model_config, _validate_transformer_config, default_architecture, encoder_decoder_type
validator default_architecture » all fields
validator encoder_decoder_type » all fields
Vision / Audio Encoders
pydantic model eole.config.models.VisionEncoderConfig
Bases: TransformerConfig, EncoderConfig
Based on mistral-community/pixtral-12b, might evolve later.
Show JSON schema
{
"title": "VisionEncoderConfig",
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"type": "object",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
encoder_sam (bool), encoder_type (Literal['vision']), image_size (int | None), image_token_id (int | None), image_token_id_list (List[int] | None), layernorm_post (bool), layernorm_pre (bool), mm_tokens_per_image (int | None), num_channels (int | None), num_position_embeddings (int | None), patch_conv_bias (bool), patch_size (int | None), temporal_patch_size (int), use_class_embedding (bool)
- Validators:
field encoder_sam : bool = False
- Validated by:
_validate_transformer_config
field encoder_type : Literal['vision'] = 'vision'
- Validated by:
_validate_transformer_config
field image_size : int | None = 1024
- Validated by:
_validate_transformer_config
field image_token_id : int | None = 10
- Validated by:
_validate_transformer_config
field image_token_id_list : List[int] | None = None
includes other image_token ids
- Validated by:
_validate_transformer_config
field layernorm_post : bool = False
- Validated by:
_validate_transformer_config
field layernorm_pre : bool = True
- Validated by:
_validate_transformer_config
field mm_tokens_per_image : int | None = 256
- Validated by:
_validate_transformer_config
field num_channels : int | None = 3
- Validated by:
_validate_transformer_config
field num_position_embeddings : int | None = None
Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48×48). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.
- Validated by:
_validate_transformer_config
field patch_conv_bias : bool = False
- Validated by:
_validate_transformer_config
field patch_size : int | None = 16
- Validated by:
_validate_transformer_config
field temporal_patch_size : int = 1
Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).
- Validated by:
_validate_transformer_config
field use_class_embedding : bool = False
- Validated by:
_validate_transformer_config
property data_type : str
pydantic model eole.config.models.AudioEncoderConfig
Bases: TransformerConfig, EncoderConfig
Configuration for audio encoder.
Show JSON schema
{
"title": "AudioEncoderConfig",
"description": "Configuration for audio encoder.",
"type": "object",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2/Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
field chunk_length : int = 30
Audio chunk length in seconds.
- Validated by:
_validate_transformer_config
field encoder_type : Literal['audio'] = 'audio'
- Validated by:
_validate_transformer_config
field hop_length : int = 160
Hop length for mel spectrogram.
- Validated by:
_validate_transformer_config
field max_source_positions : int = 1500
Maximum number of source positions (time frames after conv stem).
- Validated by:
_validate_transformer_config
field n_fft : int = 400
FFT window size for mel spectrogram.
- Validated by:
_validate_transformer_config
field num_mel_bins : int = 80
Number of mel spectrogram bins.
- Validated by:
_validate_transformer_config
field position_encoding_type : PositionEncodingType | None = None
- Validated by:
_validate_transformer_config
field sample_rate : int = 16000
Audio sample rate in Hz.
- Validated by:
_validate_transformer_config
field timestamp_resolution : float = 0.02
Time resolution per timestamp token in seconds.
- Validated by:
_validate_transformer_config
property data_type : str
RNN
pydantic model eole.config.models.RnnConfig
Bases: Config
Just to facilitate testing discriminator stuff.
Show JSON schema
{
"title": "RnnConfig",
"description": "Just to facilitate testing discriminator stuff.",
"type": "object",
"properties": {
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
field bridge : bool = False
Have an additional layer between the last encoder state and the first decoder state (RNN specific).
field rnn_type : Literal['LSTM', 'GRU'] = 'LSTM'
The gate type to use in the RNNs.
pydantic model eole.config.models.RnnEncoderConfig
Bases: RnnConfig, EncoderConfig
Show JSON schema
{
"title": "RnnEncoderConfig",
"type": "object",
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
field encoder_type : Literal['rnn', 'brnn'] = 'rnn'
pydantic model eole.config.models.RnnDecoderConfig
Bases: RnnConfig, DecoderConfig
Show JSON schema
{
"title": "RnnDecoderConfig",
"type": "object",
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
field bidirectional_encoder : bool | None = False
- Validated by:
_validate_decoder_config
field context_gate : Literal['source', 'target', 'both', None] = None
Type of context gate to use.
- Validated by:
_validate_decoder_config
field decoder_type : Literal['rnn'] = 'rnn'
- Validated by:
_validate_decoder_config
pydantic model eole.config.models.RnnModelConfig
Bases: RnnConfig, BaseModelConfig
Show JSON schema
{
"title": "RnnModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "rnn",
"default": "rnn",
"title": "Architecture",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the decoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max Relative Positions, in the case of position_encoding_type: Relative.",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
default_architecture » all fields, encoder_decoder_type » all fields
field architecture : Literal['rnn'] = 'rnn'
- Validated by:
_override_values, _validate_model_config, default_architecture, encoder_decoder_type
validator default_architecture » all fields
validator encoder_decoder_type » all fields
CNN
pydantic model eole.config.models.CnnConfig
Bases: Config
Show JSON schema
{
"title": "CnnConfig",
"type": "object",
"properties": {
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
field cnn_kernel_width : int = 3
Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.
pydantic model eole.config.models.CnnEncoderConfig
Bases: CnnConfig, EncoderConfig
Show JSON schema
{
"title": "CnnEncoderConfig",
"type": "object",
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
field encoder_type : Literal['cnn'] = 'cnn'
pydantic model eole.config.models.CnnDecoderConfig
Bases: CnnConfig, DecoderConfig
Show JSON schema
{
"title": "CnnDecoderConfig",
"type": "object",
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
field decoder_type : Literal['cnn'] = 'cnn'
- Validated by:
_validate_decoder_config
pydantic model eole.config.models.CnnModelConfig
Bases: CnnConfig, BaseModelConfig
Show JSON schema
{
"title": "CnnModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"audio": "#/$defs/AudioEncoderConfig",
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig",
"vision": "#/$defs/VisionEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
},
{
"$ref": "#/$defs/VisionEncoderConfig"
},
{
"$ref": "#/$defs/AudioEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"adapter_bias": {
"default": false,
"description": "Control whether or not the adapter module has bias weights.",
"title": "Adapter Bias",
"type": "boolean"
},
"projector_activation_fn": {
"anyOf": [
{
"$ref": "#/$defs/ActivationFunction"
},
{
"type": "null"
}
],
"default": "relu",
"description": "The activation function to use in adapter projector layer."
},
"spatial_merge_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Control the presence and size of patch merger (Mistral3)",
"title": "Spatial Merge Size"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"estimator_type": {
"default": "average",
"description": "Which hidden_states to use to feed the estimator",
"enum": [
"average",
"last_token",
"first_token"
],
"title": "Estimator Type",
"type": "string"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Original huggingface model.",
"title": "Huggingface Model"
},
"eole_version": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "0.5.2",
"description": "Eole version used to convert/train/save the model.",
"title": "Eole Version"
},
"architecture": {
"const": "cnn",
"default": "cnn",
"title": "Architecture",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"gelu-tanh",
"quick_gelu",
"silu",
"gated-gelu",
"gated-gelu-tanh",
"gated-silu",
"fused-gated-gelu",
"fused-gated-gelu-tanh",
"fused-gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"AudioEncoderConfig": {
"additionalProperties": false,
"description": "Configuration for audio encoder.",
"properties": {
"encoder_type": {
"const": "audio",
"default": "audio",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": null
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_mel_bins": {
"default": 80,
"description": "Number of mel spectrogram bins.",
"title": "Num Mel Bins",
"type": "integer"
},
"max_source_positions": {
"default": 1500,
"description": "Maximum number of source positions (time frames after conv stem).",
"title": "Max Source Positions",
"type": "integer"
},
"sample_rate": {
"default": 16000,
"description": "Audio sample rate in Hz.",
"title": "Sample Rate",
"type": "integer"
},
"chunk_length": {
"default": 30,
"description": "Audio chunk length in seconds.",
"title": "Chunk Length",
"type": "integer"
},
"n_fft": {
"default": 400,
"description": "FFT window size for mel spectrogram.",
"title": "N Fft",
"type": "integer"
},
"hop_length": {
"default": 160,
"description": "Hop length for mel spectrogram.",
"title": "Hop Length",
"type": "integer"
},
"timestamp_resolution": {
"default": 0.02,
"description": "Time resolution per timestamp token in seconds.",
"title": "Timestamp Resolution",
"type": "number"
}
},
"title": "AudioEncoderConfig",
"type": "object"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
},
"normalize": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Enable embeddings scaling. Not always necessary, but useful for some model compatibility, e.g. gemma. https://datascience.stackexchange.com/a/87909",
"title": "Normalize"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True= Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "alpha factor by which to scale rope theta.",
"title": "Alpha"
},
"xdrope_section": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Sections for XDRope mappings",
"title": "Xdrope Section"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
},
"rotary_theta_local": {
"default": 10000,
"description": "Rotary theta base length for local rotary layers",
"title": "Rotary Theta Local",
"type": "integer"
},
"interleave_local": {
"default": 0,
"description": "Local rotary layers each 1/N layers",
"title": "Interleave Local",
"type": "integer"
},
"tmax_index": {
"default": 0,
"description": "tmax indexing, 0 for all cases except gemma 3 = 1",
"title": "Tmax Index",
"type": "integer"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"with_cross_attn": {
"default": false,
"description": "Decoder uses cross-attention with encoder outputs.",
"title": "With Cross Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
},
"LM_type": {
"default": "causal",
"description": "TransformerDecoder LM type (causal = classic, or prefix LM https://arxiv.org/pdf/2308.06912)",
"enum": [
"causal",
"prefix"
],
"title": "Lm Type",
"type": "string"
},
"layer_types": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Per-layer types for hybrid architectures (e.g. Qwen3.5). Supported values: 'full_attention', 'linear_attention'. When None, all layers use full attention.",
"title": "Layer Types"
},
"linear_conv_kernel_dim": {
"default": 4,
"description": "Convolution kernel size for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Conv Kernel Dim",
"type": "integer"
},
"linear_key_head_dim": {
"default": 128,
"description": "Key head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Key Head Dim",
"type": "integer"
},
"linear_value_head_dim": {
"default": 128,
"description": "Value head dimension for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Value Head Dim",
"type": "integer"
},
"linear_num_key_heads": {
"default": 16,
"description": "Number of key heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Key Heads",
"type": "integer"
},
"linear_num_value_heads": {
"default": 32,
"description": "Number of value heads for linear attention layers (Qwen3.5 GatedDeltaNet).",
"title": "Linear Num Value Heads",
"type": "integer"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"VisionEncoderConfig": {
"additionalProperties": false,
"description": "Based on mistral-community/pixtral-12b, might evolve later.",
"properties": {
"encoder_type": {
"const": "vision",
"default": "vision",
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"moe_transformer_ff": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of hidden moe transformer feed-forward.",
"title": "Moe Transformer Ff"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"standardFP32",
"rms",
"gemma-rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-05,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"ffn_layernorm": {
"default": false,
"description": "Add pre/post_feedforward_layernorm around MLP forward. Note: introduced for gemma2 support.",
"title": "Ffn Layernorm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too by default. Can be disabled with `add_final_linear_bias`.",
"title": "Add Qkvbias",
"type": "boolean"
},
"add_key_bias": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": null,
"description": "Add bias to Key projection in MHA. Defaults to add_qkvbias when not set. Set to False for models like Whisper where K has no bias.",
"title": "Add Key Bias"
},
"query_norm": {
"default": false,
"title": "Query Norm",
"type": "boolean"
},
"key_norm": {
"default": false,
"title": "Key Norm",
"type": "boolean"
},
"qk_norm_post_rope": {
"default": false,
"title": "Qk Norm Post Rope",
"type": "boolean"
},
"add_final_linear_bias": {
"default": false,
"description": "Add bias to nn.Linear of final_linear in MHA.",
"title": "Add Final Linear Bias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"attn_scaling": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": null,
"description": "Attention scaling factor, when None uses 1/sqrt(head_dim) by default",
"title": "Attn Scaling"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_shared_experts": {
"default": 0,
"description": "Number of shared experts for MoE models (DeepSeekv2).",
"title": "Num Shared Experts",
"type": "integer"
},
"shared_expert_gate": {
"default": false,
"description": "Apply sigmoid-gated shared expert output (Qwen3.5 MoE style). When True, a linear gate is applied: output += sigmoid(gate(x)) * shared_expert(x).",
"title": "Shared Expert Gate",
"type": "boolean"
},
"first_k_dense_replace": {
"default": 0,
"description": "Number of layers using Dense instead of MoE",
"title": "First K Dense Replace",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"moe_softmax_after": {
"default": false,
"description": "Usually softmax is before topk, Mixtral does it after.",
"title": "Moe Softmax After",
"type": "boolean"
},
"moe_renormalize": {
"default": false,
"description": "Qwen renormalize expert weights after softmax.",
"title": "Moe Renormalize",
"type": "boolean"
},
"q_gating": {
"default": false,
"description": "Enable gated query in attention (Qwen3.5 style). Q projection has doubled output size; output is multiplied by sigmoid(gate).",
"title": "Q Gating",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"interpolate_mode": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Interpolation mode for position embeddings. If None: position_embeddings is a lookup table based on n_positions. If string (e.g., 'bilinear'): position_embedding is a learned grid using interpolation. (see Vision.py encoder)",
"title": "Interpolate Mode"
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"num_channels": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 3,
"title": "Num Channels"
},
"image_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1024,
"title": "Image Size"
},
"patch_size": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 16,
"title": "Patch Size"
},
"image_token_id": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 10,
"title": "Image Token Id"
},
"image_token_id_list": {
"anyOf": [
{
"items": {
"type": "integer"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "includes other image_token ids",
"title": "Image Token Id List"
},
"mm_tokens_per_image": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 256,
"title": "Mm Tokens Per Image"
},
"layernorm_pre": {
"default": true,
"title": "Layernorm Pre",
"type": "boolean"
},
"layernorm_post": {
"default": false,
"title": "Layernorm Post",
"type": "boolean"
},
"patch_conv_bias": {
"default": false,
"title": "Patch Conv Bias",
"type": "boolean"
},
"encoder_sam": {
"default": false,
"title": "Encoder Sam",
"type": "boolean"
},
"use_class_embedding": {
"default": false,
"title": "Use Class Embedding",
"type": "boolean"
},
"temporal_patch_size": {
"default": 1,
"description": "Temporal kernel size for Conv3D patch embedding. When >1 a nn.Conv3d is used (e.g. Qwen3.5 VL uses 2).",
"title": "Temporal Patch Size",
"type": "integer"
},
"num_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Size of the absolute position embedding table (Qwen3.5 VL uses 2304 = 48\u00d748). When set together with position_encoding_type=Rotary both absolute embeddings and 2D RoPE are applied.",
"title": "Num Position Embeddings"
}
},
"title": "VisionEncoderConfig",
"type": "object"
}
},
"additionalProperties": false
}
- Config:
- validate_assignment: bool = True
- validate_default: bool = True
- use_enum_values: bool = True
- extra: str = forbid
- protected_namespaces: tuple = ()
- Fields:
- Validators:
default_architecture » all fields, encoder_decoder_type » all fields
field architecture : Literal['cnn'] = 'cnn'
- Validated by:
_override_values, _validate_model_config, default_architecture, encoder_decoder_type