
Models

Base Configs

pydantic model eole.config.models.BaseModelConfig[source]

Bases: Config

Show JSON schema
{
"title": "BaseModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig",
"transformer_lm": "#/$defs/TransformerLMDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/TransformerLMDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"architecture": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Architecture"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"enum": [
"mean"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"enum": [
"rnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"TransformerLMDecoderConfig": {
"additionalProperties": false,
"description": "Right now just wraps TransformerDecoderConfig for simplicity.\nMight merge in a single class later once TransformerLM path is clarified.",
"properties": {
"decoder_type": {
"const": "transformer_lm",
"default": "transformer_lm",
"enum": [
"transformer_lm"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerLMDecoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field add_estimator : bool = False

Add estimator layer

field architecture : str | None = None

field decoder : TransformerDecoderConfig | TransformerLMDecoderConfig | RnnDecoderConfig | CnnDecoderConfig | None = None

Major parameters of a decoder.

field embeddings : EmbeddingsConfig [Optional]

Contains most of the args useful to build the Embeddings module.

field encoder : TransformerEncoderConfig | RnnEncoderConfig | CnnEncoderConfig | MeanEncoderConfig | None = None

Major parameters of an encoder.

field generator_bias : bool = True

Control whether or not the generator Linear module has bias weights.

field generator_function : Literal['softmax', 'sparsemax'] = 'softmax'

Which function to use for generating probabilities over the target vocabulary.

field hidden_size : int = -1

Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.

field input_feed : int = 1

Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.

field layers : int = -1

Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).

field left_pad : bool = False

Enable left-padding, useful for some LLMs.

field share_decoder_embeddings : bool = False

Use a shared weight matrix for the input and output word embeddings in the decoder.

field share_embeddings : bool = False

Share the word embeddings between encoder and decoder. Requires a shared vocabulary.

field transformer_ff : int = -1

Size of hidden transformer feed-forward.

field word_vec_size : int = -1

Word embedding size for src and tgt.

validator default_architecture » all fields[source]

validator str_to_dict » embeddings, encoder, decoder[source]

update_model_opts()[source]

property model_type : ModelType[source]
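
The encoder and decoder fields are discriminated unions resolved through the encoder_type / decoder_type keys shown in the schema, so they can be given as nested dicts (the str_to_dict validator also accepts string input). A minimal sketch with illustrative values, not taken from the eole docs; the validators listed above may adjust or reject them:

from eole.config.models import BaseModelConfig

config = BaseModelConfig(
    embeddings={"word_vec_size": 512},
    encoder={"encoder_type": "transformer", "layers": 6, "heads": 8, "transformer_ff": 2048},
    decoder={"decoder_type": "transformer", "layers": 6, "heads": 8, "transformer_ff": 2048},
    hidden_size=512,        # overwrites encoder/decoder hidden_size when set
    share_embeddings=True,  # assumes a shared src/tgt vocabulary
)
print(config.model_type)    # the model_type property documented above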

pydantic model eole.config.models.EmbeddingsConfig[source]

Bases: Config

Show JSON schema
{
"title": "EmbeddingsConfig",
"type": "object",
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
}
},
"$defs": {
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
}
},
"additionalProperties": false
}

field freeze_word_vecs_dec : bool = False

Freeze word embeddings on the decoder side.

field freeze_word_vecs_enc : bool = False

Freeze word embeddings on the encoder side.

field n_positions : int | None = None

Two cases. Case 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: Max relative positions, in the case of position_encoding_type: Relative.

field position_encoding : bool = False

Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.

field position_encoding_type : PositionEncodingType | None = PositionEncodingType.SinusoidalInterleaved

Type of positional encoding.

field position_shift : int | None = 0

Shift applied to position IDs before building position embeddings; a workaround ("dirty patch") for xlm-roberta-xl.

field src_word_vec_size : int = 512

Word embedding size for src.

field tgt_word_vec_size : int = 512

Word embedding size for tgt.

field word_vec_size : int = -1

Word embedding size for src and tgt.

validator validate_embeddings » all fields[source]
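
The n_positions field covers the two cases described above. A hedged sketch using the documented field names; the values are illustrative and validate_embeddings may impose further constraints:

from eole.config.models import EmbeddingsConfig

# Case 1: learned absolute positions -- n_positions is the number of positions to learn.
learned = EmbeddingsConfig(
    position_encoding=True,
    position_encoding_type="Learned",
    n_positions=1024,
)

# Case 2: relative positions -- n_positions acts as the maximum relative distance.
relative = EmbeddingsConfig(
    position_encoding_type="Relative",
    n_positions=20,
)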

pydantic model eole.config.models.EncoderConfig[source]

Bases: Config

Abstract class for all encoders

Show JSON schema
{
"title": "EncoderConfig",
"description": "Abstract class for all encoders",
"type": "object",
"properties": {
"encoder_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "rnn",
"description": "Type of encoder layer(s) to use.",
"title": "Encoder Type"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"additionalProperties": false
}

field encoder_type : str | None = 'rnn'

Type of encoder layer(s) to use.

field hidden_size : int = 512

Size of encoder hidden states.

field layers : int = 2

Number of layers in the encoder.

field src_word_vec_size : int = 512

Word embedding size for src.

pydantic model eole.config.models.DecoderConfig[source]

Bases: Config

Abstract class for all decoders

Show JSON schema
{
"title": "DecoderConfig",
"description": "Abstract class for all decoders",
"type": "object",
"properties": {
"decoder_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "rnn",
"description": "Type of decoder layer(s) to use.",
"title": "Decoder Type"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
}
},
"additionalProperties": false
}

field coverage_attn : bool = False

Train a coverage attention layer.

  • Validated by:
    • _validate_decoder_config

field decoder_type : str | None = 'rnn'

Type of decoder layer(s) to use.

  • Validated by:
    • _validate_decoder_config

field global_attention : Literal['dot', 'general', 'mlp', None] = 'general'

The attention type to use. (Luong=general, Bahdanau=MLP)

  • Validated by:
    • _validate_decoder_config

field global_attention_function : Literal['softmax', 'sparsemax'] = 'softmax'

Global attention function to use.

  • Validated by:
    • _validate_decoder_config

field hidden_size : int = 512

Size of decoder hidden states.

  • Validated by:
    • _validate_decoder_config

field lambda_coverage : float = 0.0

Lambda value for coverage loss of See et al (2017)

  • Validated by:
    • _validate_decoder_config

field layers : int = 2

Number of layers in the decoder.

  • Validated by:
    • _validate_decoder_config

field tgt_word_vec_size : int = 512

Word embedding size for tgt.

  • Validated by:
    • _validate_decoder_config
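
DecoderConfig itself is abstract; concrete decoders (see the $defs in the schemas above) pin decoder_type to a constant and are selected through the decoder discriminator on BaseModelConfig. A hedged sketch with an RNN decoder using Bahdanau-style (mlp) attention; the values are illustrative and remain subject to the _validate_decoder_config and default_architecture checks:

from eole.config.models import BaseModelConfig

model = BaseModelConfig(
    encoder={"encoder_type": "brnn", "rnn_type": "GRU"},
    decoder={
        "decoder_type": "rnn",
        "rnn_type": "GRU",
        "global_attention": "mlp",              # Bahdanau; "general"/"dot" are Luong variants
        "global_attention_function": "softmax",
        "bridge": True,
    },
)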

pydantic model eole.config.models.CustomModelConfig[source]

Bases: BaseModelConfig

Wrap anything that does not fit a set common architecture.

Show JSON schema
{
"title": "CustomModelConfig",
"description": "Wrap anything that does not fit a set common architecture.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig",
"transformer_lm": "#/$defs/TransformerLMDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/TransformerLMDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"architecture": {
"const": "custom",
"default": "custom",
"enum": [
"custom"
],
"title": "Architecture",
"type": "string"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"enum": [
"mean"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"enum": [
"rnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"TransformerLMDecoderConfig": {
"additionalProperties": false,
"description": "Right now just wraps TransformerDecoderConfig for simplicity.\nMight merge in a single class later once TransformerLM path is clarified.",
"properties": {
"decoder_type": {
"const": "transformer_lm",
"default": "transformer_lm",
"enum": [
"transformer_lm"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerLMDecoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:
  • Validators:

field architecture : Literal['custom'] = 'custom'

Transformer

pydantic model eole.config.models.TransformerConfig[source]

Bases: Config

This base TransformerConfig class regroups parameters that can be set either at model level or at encoder/decoder level. The BaseModelConfig._override_values validator overrides encoder/decoder values with model values where relevant.
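
A minimal usage sketch (illustrative, not part of the generated reference; it assumes eole is installed and that the validators accept these combinations):

from eole.config.models import TransformerConfig

# Every field has a default, so a bare instantiation is valid.
cfg = TransformerConfig()
print(cfg.heads, cfg.transformer_ff, cfg.layer_norm)  # 8 2048 standard

# The same block-level keys can also be set on an encoder/decoder config,
# or once at model level (see TransformerModelConfig below).
cfg = TransformerConfig(heads=16, transformer_ff=4096, mlp_activation_fn="gated-silu")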

Show JSON schema
{
"title": "TransformerConfig",
"description": "This base TransformerConfig class regroups parameters than can\nboth be set at model level or either encoder/decoder level.\nBaseModelConfig._override_values validator overrides\nencoder/decoder values with model values if relevant.",
"type": "object",
"properties": {
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}

field add_ffnbias : bool = False

Add bias to nn.Linear of MLP FFN.

  • Validated by:
    • _validate_transformer_config

field add_qkvbias : bool = False

Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.

  • Validated by:
    • _validate_transformer_config

field head_dim : int | None = None

Head dimension, when it needs to differ from hidden_size // heads.

  • Validated by:
    • _validate_transformer_config

field heads : int = 8

Number of heads for transformer self-attention.

  • Validated by:
    • _validate_transformer_config

field heads_kv : int | None = None

Number of heads for KV. If None, heads_kv defaults to heads; otherwise it sets the number of KV heads (e.g. Falcon 40B).

  • Validated by:
    • _validate_transformer_config
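
As a hedged illustration (values are arbitrary), grouped-query attention in the style of Falcon 40B is expressed by setting heads_kv lower than heads; leaving it as None keeps standard multi-head attention:

from eole.config.models import TransformerConfig

# 32 query heads sharing 8 key/value heads (grouped-query attention).
gqa = TransformerConfig(heads=32, heads_kv=8)

# heads_kv=None (the default) falls back to heads, i.e. one KV head per query head.
mha = TransformerConfig(heads=32)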

field layer_norm : Literal['standard', 'rms'] = 'standard'

Type of layer normalization in transformer architecture.

  • Validated by:
    • _validate_transformer_config

field mlp_activation_fn : ActivationFunction = ActivationFunction.relu

The activation function to use in MLP layer.

  • Validated by:
    • _validate_transformer_config

field n_positions : int | None = None

Two cases. Case 1: absolute number of positions to learn position embeddings on (position_encoding_type: Learned). Case 2: maximum relative positions (position_encoding_type: Relative).

  • Validated by:
    • _validate_transformer_config
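
A hedged sketch of the two cases (values are arbitrary and subject to the transformer validators):

from eole.config.models import TransformerConfig

# Case 1: learned absolute position embeddings over 1024 positions.
learned = TransformerConfig(position_encoding_type="Learned", n_positions=1024)

# Case 2: relative positions, where n_positions bounds the relative distance.
relative = TransformerConfig(position_encoding_type="Relative", n_positions=20)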

field norm_eps : float = 1e-06

Layer norm epsilon.

  • Validated by:
    • _validate_transformer_config

field num_experts : int = 0

Number of experts for MoE models.

  • Validated by:
    • _validate_transformer_config

field num_experts_per_tok : int = 2

Number of experts per token.

  • Validated by:
    • _validate_transformer_config

field parallel_residual : bool = False

Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.

  • Validated by:
    • _validate_transformer_config

field position_encoding_type : PositionEncodingType | None = PositionEncodingType.SinusoidalInterleaved

Type of positional encoding.

  • Validated by:
    • _validate_transformer_config

field relative_positions_buckets : int = 0

Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).

  • Validated by:
    • _validate_transformer_config

field rope_config : RotaryPositionConfig | None = None

Rotary position config, if relevant.

  • Validated by:
    • _validate_transformer_config
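
A hedged sketch of a rotary setup; the nested keys come from RotaryPositionConfig in the schema above, and passing them as a plain dict relies on pydantic coercing it into the nested model:

from eole.config.models import TransformerConfig

# Rotary embeddings with a larger theta (Mixtral-style) and HuggingFace-style
# half slicing instead of interleaving.
rotary = TransformerConfig(
    position_encoding_type="Rotary",
    rope_config={"rotary_theta": 1000000, "rotary_interleave": False},
)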

field shared_layer_norm : bool = False

Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.

  • Validated by:
    • _validate_transformer_config

field sliding_window : int = 0

Sliding window for transformer self-attention.

  • Validated by:
    • _validate_transformer_config

field transformer_ff : int = 2048

Size of hidden transformer feed-forward.

  • Validated by:
    • _validate_transformer_config

property dim_per_head : int[source]

pydantic model eole.config.models.TransformerEncoderConfig[source]

Bases: TransformerConfig, EncoderConfig

Show JSON schema
{
"title": "TransformerEncoderConfig",
"type": "object",
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:
  • Validators:

field encoder_type : Literal['transformer'] = 'transformer'

  • Validated by:
    • _validate_transformer_config
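
A hedged sketch of a standalone encoder config, using only fields listed in the schema above:

from eole.config.models import TransformerEncoderConfig

# A 6-layer transformer encoder; encoder_type is pinned to "transformer" by the Literal field.
enc = TransformerEncoderConfig(layers=6, hidden_size=512, heads=8, transformer_ff=2048)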

pydantic model eole.config.models.TransformerDecoderConfig[source]

Bases: TransformerConfig, DecoderConfig

Show JSON schema
{
"title": "TransformerDecoderConfig",
"type": "object",
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}

field aan_useffn : bool = False

Turn on the FFN layer in the AAN decoder.

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field alignment_heads : int = 0

Number of cross attention heads per layer to supervise with.

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field alignment_layer : int = -2

Layer number which has to be supervised.

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field decoder_type : Literal['transformer'] = 'transformer'

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field full_context_alignment : bool = False

Whether alignment is conditioned on full target context.

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

field lambda_align : float = 0.0

Lambda value for alignment loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config
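
As a hedged example of the alignment-supervision fields (Garg et al, 2019), with illustrative values that remain subject to the decoder validators:

from eole.config.models import TransformerDecoderConfig

# Supervise one cross-attention head of the second-to-last layer (the default
# alignment_layer) with the alignment loss, weighted by lambda_align.
dec = TransformerDecoderConfig(
    lambda_align=0.05,
    alignment_layer=-2,
    alignment_heads=1,
    full_context_alignment=True,
)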

pydantic model eole.config.models.TransformerLMDecoderConfig[source]

Bases: TransformerDecoderConfig

For now this simply wraps TransformerDecoderConfig; it might be merged into a single class later, once the TransformerLM path is clarified.
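
A hedged sketch of a decoder-only (GPT-style) configuration, using only fields listed in the schema below; decoder_type defaults to "transformer_lm":

from eole.config.models import TransformerLMDecoderConfig

# Decoder-only stack: sizes, RMSNorm and a gated activation.
lm_dec = TransformerLMDecoderConfig(
    layers=12,
    hidden_size=768,
    heads=12,
    transformer_ff=3072,
    layer_norm="rms",
    mlp_activation_fn="gated-silu",
)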

Show JSON schema
{
"title": "TransformerLMDecoderConfig",
"description": "Right now just wraps TransformerDecoderConfig for simplicity.\nMight merge in a single class later once TransformerLM path is clarified.",
"type": "object",
"properties": {
"decoder_type": {
"const": "transformer_lm",
"default": "transformer_lm",
"enum": [
"transformer_lm"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:
  • Validators:

field decoder_type : Literal['transformer_lm'] = 'transformer_lm'

  • Validated by:
    • _validate_decoder_config
    • _validate_transformer_config
    • _validate_transformer_decoder_config

pydantic model eole.config.models.TransformerModelConfig[source]

Bases: TransformerConfig, BaseModelConfig

Facilitates setting some transformer-specific parameters at the model level.
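
A hedged sketch of the model-level override described above (assumed behavior based on the field descriptions; the exact propagation rules live in BaseModelConfig._override_values, and passing the sub-configs as plain dicts with their discriminator keys is an assumption):

from eole.config.models import TransformerModelConfig

# Shared transformer parameters set once at model level; the override validator
# is expected to propagate them to the encoder/decoder sub-configs where relevant.
model = TransformerModelConfig(
    layers=6,
    hidden_size=512,
    heads=8,
    transformer_ff=2048,
    encoder={"encoder_type": "transformer"},
    decoder={"decoder_type": "transformer"},
)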

Show JSON schema
{
"title": "TransformerModelConfig",
"description": "Facilitate setting some transformer specific params at model level.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig",
"transformer_lm": "#/$defs/TransformerLMDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/TransformerLMDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"architecture": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"enum": [
"mean"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"enum": [
"rnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"TransformerLMDecoderConfig": {
"additionalProperties": false,
"description": "Right now just wraps TransformerDecoderConfig for simplicity.\nMight merge in a single class later once TransformerLM path is clarified.",
"properties": {
"decoder_type": {
"const": "transformer_lm",
"default": "transformer_lm",
"enum": [
"transformer_lm"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerLMDecoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['transformer'] = 'transformer'

validator default_architecture » all fields[source]

validator encoder_decoder_type » all fields[source]
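
A minimal usage sketch (not part of the generated schema above; the import path and exact constructor behaviour are assumptions based on the class names shown on this page) of composing the encoder/decoder configs for a seq2seq transformer model:

from eole.config.models import TransformerEncoderConfig, TransformerDecoderConfig

# Build matching encoder/decoder configs for a small seq2seq transformer.
# Field names mirror the JSON schema above; the values are arbitrary examples.
encoder = TransformerEncoderConfig(layers=6, hidden_size=512, heads=8, transformer_ff=2048)
decoder = TransformerDecoderConfig(layers=6, hidden_size=512, heads=8, transformer_ff=2048)

# The encoder_type / decoder_type discriminators shown in the schema are what let
# a parent model config accept any of the encoder or decoder variants.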

pydantic model eole.config.models.TransformerLMModelConfig[source]

Bases: TransformerConfig, BaseModelConfig

Facilitate setting some transformer specific params at model level.

Show JSON schema
{
"title": "TransformerLMModelConfig",
"description": "Facilitate setting some transformer specific params at model level.",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder",
"type": "null"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig",
"transformer_lm": "#/$defs/TransformerLMDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/TransformerLMDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"architecture": {
"const": "transformer_lm",
"default": "transformer_lm",
"enum": [
"transformer_lm"
],
"title": "Architecture",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"enum": [
"rnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerLMDecoderConfig": {
"additionalProperties": false,
"description": "Right now just wraps TransformerDecoderConfig for simplicity.\nMight merge in a single class later once TransformerLM path is clarified.",
"properties": {
"decoder_type": {
"const": "transformer_lm",
"default": "transformer_lm",
"enum": [
"transformer_lm"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerLMDecoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['transformer_lm'] = 'transformer_lm'

field encoder : None = None

Major parameters of an encoder.

validator default_architecture » all fields[source]

validator encoder_decoder_type » all fields[source]
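
As an illustration (a hedged sketch, not from the generated docs; whether the validators fill in a default decoder when none is given is an assumption here), a decoder-only LM config could be declared like this:

from eole.config.models import TransformerLMModelConfig

# Decoder-only language model: the encoder field is forced to None, and the
# transformer-specific params (layers, heads, hidden_size, ...) are set once
# at model level. The values below are arbitrary examples.
config = TransformerLMModelConfig(
    layers=12,
    hidden_size=768,
    heads=12,
    transformer_ff=3072,
    share_decoder_embeddings=True,
    left_pad=True,  # left-padding, useful for some LLMs
)
print(config.architecture)  # "transformer_lm"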

RNN

pydantic model eole.config.models.RnnConfig[source]

Bases: Config

Just to facilitate testing discriminator stuff.

Show JSON schema
{
"title": "RnnConfig",
"description": "Just to facilitate testing discriminator stuff.",
"type": "object",
"properties": {
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"additionalProperties": false
}

field bridge : bool = False

Have an additional layer between the last encoder state and the first decoder state (RNN specific).

field rnn_type : Literal['LSTM', 'GRU'] = 'LSTM'

The gate type to use in the RNNs.

pydantic model eole.config.models.RnnEncoderConfig[source]

Bases: RnnConfig, EncoderConfig

Show JSON schema
{
"title": "RnnEncoderConfig",
"type": "object",
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field encoder_type : Literal['rnn', 'brnn'] = 'rnn'
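
A short usage sketch (assumed, not from the docs): selecting "brnn" picks a bidirectional RNN encoder through the same encoder_type discriminator used in the model-level schemas.

from eole.config.models import RnnEncoderConfig

# Bidirectional LSTM encoder; the remaining fields keep their defaults
# (layers=2, hidden_size=512, src_word_vec_size=512, bridge=False).
encoder = RnnEncoderConfig(encoder_type="brnn", rnn_type="LSTM")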

pydantic model eole.config.models.RnnDecoderConfig[source]

Bases: RnnConfig, DecoderConfig

Show JSON schema
{
"title": "RnnDecoderConfig",
"type": "object",
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"enum": [
"rnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"additionalProperties": false
}

field bidirectional_encoder : bool | None = False

  • Validated by:
    • _validate_decoder_config

field context_gate : Literal['source', 'target', 'both', None] = None

Type of context gate to use.

  • Validated by:
    • _validate_decoder_config

field decoder_type : Literal['rnn'] = 'rnn'

  • Validated by:
    • _validate_decoder_config
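
For illustration only (a hedged sketch; field names follow the schema above and the values are arbitrary), an attentional RNN decoder config might look like:

from eole.config.models import RnnDecoderConfig

decoder = RnnDecoderConfig(
    layers=2,
    hidden_size=512,
    global_attention="mlp",          # Bahdanau-style attention
    global_attention_function="softmax",
    context_gate="both",
    bidirectional_encoder=True,      # should match the encoder's directionality
)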

pydantic model eole.config.models.RnnModelConfig[source]

Bases: RnnConfig, BaseModelConfig

Show JSON schema
{
"title": "RnnModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig",
"transformer_lm": "#/$defs/TransformerLMDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/TransformerLMDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"architecture": {
"const": "rnn",
"default": "rnn",
"enum": [
"rnn"
],
"title": "Architecture",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"enum": [
"mean"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"enum": [
"rnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"TransformerLMDecoderConfig": {
"additionalProperties": false,
"description": "Right now just wraps TransformerDecoderConfig for simplicity.\nMight merge in a single class later once TransformerLM path is clarified.",
"properties": {
"decoder_type": {
"const": "transformer_lm",
"default": "transformer_lm",
"enum": [
"transformer_lm"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerLMDecoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['rnn'] = 'rnn'

validator default_architecture » all fields[source]

validator encoder_decoder_type » all fields[source]
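
As a usage illustration (not taken from the source docs), the snippet below sketches how this RNN-architecture config could be built in Python. The class name RnnModelConfig and its import path are assumptions inferred from the naming pattern of the other configs on this page; nested encoder/decoder dicts would be resolved through the encoder_type/decoder_type discriminators shown in the schema above.

# Minimal sketch, assuming the RNN model config is exposed as
# eole.config.models.RnnModelConfig (name inferred, not verified).
from eole.config.models import RnnModelConfig

config = RnnModelConfig(
    encoder={"encoder_type": "brnn", "layers": 2, "hidden_size": 512},
    decoder={"decoder_type": "rnn", "layers": 2, "hidden_size": 512, "rnn_type": "LSTM"},
    hidden_size=512,  # overwrites encoder/decoder hidden_size when set
)
print(config.architecture)  # 'rnn'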

CNN

pydantic model eole.config.models.CnnConfig[source]

Bases: Config

Show JSON schema
{
"title": "CnnConfig",
"type": "object",
"properties": {
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field cnn_kernel_width : int = 3

Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.
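
For illustration only, a minimal sketch of setting this shared option directly. It assumes the class is importable exactly as documented above and that eole uses pydantic v2 (as the schema layout suggests), so model_dump() is available.

from eole.config.models import CnnConfig

cnn_opts = CnnConfig(cnn_kernel_width=5)
# Convolution layers built from this config would use kernel_size=(5, 1).
print(cnn_opts.model_dump())  # {'cnn_kernel_width': 5}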

pydantic model eole.config.models.CnnEncoderConfig[source]

Bases: CnnConfig, EncoderConfig

Show JSON schema
{
"title": "CnnEncoderConfig",
"type": "object",
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field encoder_type : Literal['cnn'] = 'cnn'
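
A hedged usage sketch, with field names taken from the schema above; because the model config sets extra = forbid, unknown keys would be rejected.

from eole.config.models import CnnEncoderConfig

encoder = CnnEncoderConfig(layers=4, hidden_size=256, src_word_vec_size=256)
print(encoder.encoder_type)      # 'cnn' (fixed literal)
print(encoder.cnn_kernel_width)  # 3 (default)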

pydantic model eole.config.models.CnnDecoderConfig[source]

Bases: CnnConfig, DecoderConfig

Show JSON schema
{
"title": "CnnDecoderConfig",
"type": "object",
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:
  • Validators:

field decoder_type : Literal['cnn'] = 'cnn'

  • Validated by:
    • _validate_decoder_config
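
For reference, a minimal sketch of instantiating this decoder config; keep in mind that _validate_decoder_config (listed above) may enforce additional constraints beyond what is shown here.

from eole.config.models import CnnDecoderConfig

decoder = CnnDecoderConfig(
    layers=4,
    hidden_size=256,
    global_attention="general",           # Luong-style attention
    global_attention_function="softmax",
    cnn_kernel_width=3,
)
print(decoder.decoder_type)  # 'cnn'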

pydantic model eole.config.models.CnnModelConfig[source]

Bases: CnnConfig, BaseModelConfig

Show JSON schema
{
"title": "CnnModelConfig",
"type": "object",
"properties": {
"embeddings": {
"$ref": "#/$defs/EmbeddingsConfig",
"description": "Contains most of the args useful to build the Embeddings module."
},
"encoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"brnn": "#/$defs/RnnEncoderConfig",
"cnn": "#/$defs/CnnEncoderConfig",
"mean": "#/$defs/MeanEncoderConfig",
"rnn": "#/$defs/RnnEncoderConfig",
"transformer": "#/$defs/TransformerEncoderConfig"
},
"propertyName": "encoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerEncoderConfig"
},
{
"$ref": "#/$defs/RnnEncoderConfig"
},
{
"$ref": "#/$defs/CnnEncoderConfig"
},
{
"$ref": "#/$defs/MeanEncoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of an encoder.",
"title": "Encoder"
},
"decoder": {
"anyOf": [
{
"discriminator": {
"mapping": {
"cnn": "#/$defs/CnnDecoderConfig",
"rnn": "#/$defs/RnnDecoderConfig",
"transformer": "#/$defs/TransformerDecoderConfig",
"transformer_lm": "#/$defs/TransformerLMDecoderConfig"
},
"propertyName": "decoder_type"
},
"oneOf": [
{
"$ref": "#/$defs/TransformerDecoderConfig"
},
{
"$ref": "#/$defs/TransformerLMDecoderConfig"
},
{
"$ref": "#/$defs/RnnDecoderConfig"
},
{
"$ref": "#/$defs/CnnDecoderConfig"
}
]
},
{
"type": "null"
}
],
"default": null,
"description": "Major parameters of a decoder.",
"title": "Decoder"
},
"hidden_size": {
"default": -1,
"description": "Size of hidden states. Overwrites [encoder/decoder].hidden_size if set.",
"title": "Hidden Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"layers": {
"default": -1,
"description": "Number of layers in both encoder and decoder (will overwrite enc_layers/dec_layers).",
"title": "Layers",
"type": "integer"
},
"transformer_ff": {
"default": -1,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"share_decoder_embeddings": {
"default": false,
"description": "Use a share weight matrix for the input and output word embeddings in the decoder.",
"title": "Share Decoder Embeddings",
"type": "boolean"
},
"share_embeddings": {
"default": false,
"description": "Share the word embeddings between encoder and decoder. Need to use shared vocabulary for this option.",
"title": "Share Embeddings",
"type": "boolean"
},
"input_feed": {
"default": 1,
"description": "Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.",
"title": "Input Feed",
"type": "integer"
},
"generator_function": {
"default": "softmax",
"description": "Which function to use for generating probabilities over the target vocabulary.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Generator Function",
"type": "string"
},
"generator_bias": {
"default": true,
"description": "Control whether or not the generator Linear module has bias weights.",
"title": "Generator Bias",
"type": "boolean"
},
"add_estimator": {
"default": false,
"description": "Add estimator layer",
"title": "Add Estimator",
"type": "boolean"
},
"left_pad": {
"default": false,
"description": "Enable left-padding, useful for some LLMs.",
"title": "Left Pad",
"type": "boolean"
},
"architecture": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Architecture",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"$defs": {
"ActivationFunction": {
"enum": [
"relu",
"gelu",
"silu",
"gated-gelu",
"gated-silu"
],
"title": "ActivationFunction",
"type": "string"
},
"CnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnDecoderConfig",
"type": "object"
},
"CnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "cnn",
"default": "cnn",
"enum": [
"cnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"cnn_kernel_width": {
"default": 3,
"description": "Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in convolution layers.",
"title": "Cnn Kernel Width",
"type": "integer"
}
},
"title": "CnnEncoderConfig",
"type": "object"
},
"EmbeddingsConfig": {
"additionalProperties": false,
"properties": {
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"word_vec_size": {
"default": -1,
"description": "Word embedding size for src and tgt.",
"title": "Word Vec Size",
"type": "integer"
},
"freeze_word_vecs_enc": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Enc",
"type": "boolean"
},
"freeze_word_vecs_dec": {
"default": false,
"description": "Freeze word embeddings on the encoder side.",
"title": "Freeze Word Vecs Dec",
"type": "boolean"
},
"position_encoding": {
"default": false,
"description": "Absolute position encoding, see position_encoding_type. Necessary for non-RNN style models.",
"title": "Position Encoding",
"type": "boolean"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"position_shift": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Positions IDS shift before making position embed dirty patch to cover for xlm-roberta-xl",
"title": "Position Shift"
}
},
"title": "EmbeddingsConfig",
"type": "object"
},
"MeanEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "mean",
"default": "mean",
"enum": [
"mean"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
}
},
"title": "MeanEncoderConfig",
"type": "object"
},
"PositionEncodingType": {
"enum": [
"SinusoidalInterleaved",
"SinusoidalConcat",
"Learned",
"Relative",
"Rotary",
"Alibi"
],
"title": "PositionEncodingType",
"type": "string"
},
"RnnDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "rnn",
"default": "rnn",
"enum": [
"rnn"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
},
"context_gate": {
"default": null,
"description": "Type of context gate to use.",
"enum": [
"source",
"target",
"both",
null
],
"title": "Context Gate"
},
"bidirectional_encoder": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Bidirectional Encoder"
}
},
"title": "RnnDecoderConfig",
"type": "object"
},
"RnnEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"default": "rnn",
"enum": [
"rnn",
"brnn"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"bridge": {
"default": false,
"description": "Have an additional layer between the last encoder state and the first decoder state (RNN specific).",
"title": "Bridge",
"type": "boolean"
},
"rnn_type": {
"default": "LSTM",
"description": "The gate type to use in the RNNs.",
"enum": [
"LSTM",
"GRU"
],
"title": "Rnn Type",
"type": "string"
}
},
"title": "RnnEncoderConfig",
"type": "object"
},
"RotaryPositionConfig": {
"additionalProperties": false,
"description": "Configuration for rotary position embeddings used in transformer models.",
"properties": {
"rotary_interleave": {
"default": true,
"description": "Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. (True=default Llama from Meta (original), False= used by all HuggingFace models)",
"title": "Rotary Interleave",
"type": "boolean"
},
"rotary_theta": {
"default": 10000,
"description": "Rotary theta base length, 1e4 for Llama2.Mistral, 1e6 for Mixtral",
"title": "Rotary Theta",
"type": "integer"
},
"rotary_dim": {
"default": 0,
"description": "Rotary dim when model requires it to be different to head dim.",
"title": "Rotary Dim",
"type": "integer"
},
"scaling_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Specifies the type of RoPE scaling to be applied, if any.",
"title": "Scaling Type"
},
"scaling_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 8.0,
"description": "Factor by which to scale RoPE embeddings.",
"title": "Scaling Factor"
},
"low_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Scaling factor applied to the lower frequency components of RoPE.",
"title": "Low Freq Factor"
},
"high_freq_factor": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 4.0,
"description": "Scaling factor applied to the higher frequency components of RoPE.",
"title": "High Freq Factor"
},
"original_max_position_embeddings": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 8192,
"description": "Original maximum position embeddings for RoPE scaling.",
"title": "Original Max Position Embeddings"
}
},
"title": "RotaryPositionConfig",
"type": "object"
},
"TransformerDecoderConfig": {
"additionalProperties": false,
"properties": {
"decoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerDecoderConfig",
"type": "object"
},
"TransformerEncoderConfig": {
"additionalProperties": false,
"properties": {
"encoder_type": {
"const": "transformer",
"default": "transformer",
"enum": [
"transformer"
],
"title": "Encoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the encoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of encoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"src_word_vec_size": {
"default": 512,
"description": "Word embedding size for src.",
"title": "Src Word Vec Size",
"type": "integer"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
}
},
"title": "TransformerEncoderConfig",
"type": "object"
},
"TransformerLMDecoderConfig": {
"additionalProperties": false,
"description": "Right now just wraps TransformerDecoderConfig for simplicity.\nMight merge in a single class later once TransformerLM path is clarified.",
"properties": {
"decoder_type": {
"const": "transformer_lm",
"default": "transformer_lm",
"enum": [
"transformer_lm"
],
"title": "Decoder Type",
"type": "string"
},
"layers": {
"default": 2,
"description": "Number of layers in the decoder.",
"title": "Layers",
"type": "integer"
},
"hidden_size": {
"default": 512,
"description": "Size of decoder hidden states.",
"title": "Hidden Size",
"type": "integer"
},
"tgt_word_vec_size": {
"default": 512,
"description": "Word embedding size for tgt.",
"title": "Tgt Word Vec Size",
"type": "integer"
},
"coverage_attn": {
"default": false,
"description": "Train a coverage attention layer.",
"title": "Coverage Attn",
"type": "boolean"
},
"lambda_coverage": {
"default": 0.0,
"description": "Lambda value for coverage loss of See et al (2017)",
"title": "Lambda Coverage",
"type": "number"
},
"global_attention": {
"default": "general",
"description": "The attention type to use. (Luong=general, Bahdanau=MLP)",
"enum": [
"dot",
"general",
"mlp",
null
],
"title": "Global Attention"
},
"global_attention_function": {
"default": "softmax",
"description": "Global attention function to use.",
"enum": [
"softmax",
"sparsemax"
],
"title": "Global Attention Function",
"type": "string"
},
"sliding_window": {
"default": 0,
"description": "Sliding window for transformer self-attention.",
"title": "Sliding Window",
"type": "integer"
},
"heads": {
"default": 8,
"description": "Number of heads for transformer self-attention.",
"title": "Heads",
"type": "integer"
},
"transformer_ff": {
"default": 2048,
"description": "Size of hidden transformer feed-forward.",
"title": "Transformer Ff",
"type": "integer"
},
"relative_positions_buckets": {
"default": 0,
"description": "Enable relative position bias (https://github.com/google-research/text-to-text-transfer-transformer).",
"title": "Relative Positions Buckets",
"type": "integer"
},
"mlp_activation_fn": {
"$ref": "#/$defs/ActivationFunction",
"default": "relu",
"description": "The activation function to use in MLP layer."
},
"layer_norm": {
"default": "standard",
"description": "Type of layer normalization in transformer architecture.",
"enum": [
"standard",
"rms"
],
"title": "Layer Norm",
"type": "string"
},
"norm_eps": {
"default": 1e-06,
"description": "Layer norm epsilon.",
"title": "Norm Eps",
"type": "number"
},
"shared_layer_norm": {
"default": false,
"description": "Use a shared layer_norm in parallel residual attention. Note: must be True for Falcon 7B, False for Falcon 40B, same for GPT-J and GPT-NeoX models.",
"title": "Shared Layer Norm",
"type": "boolean"
},
"add_qkvbias": {
"default": false,
"description": "Add bias to nn.Linear of Query/Key/Value in MHA. Note: this will add bias to output projection layer too.",
"title": "Add Qkvbias",
"type": "boolean"
},
"heads_kv": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Number of heads for KV. heads_kv=heads if None, else number of heads for KV(e.g. Falcon 40B)",
"title": "Heads Kv"
},
"head_dim": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Head dimension when this needs to be different vs hidden_size // heads",
"title": "Head Dim"
},
"add_ffnbias": {
"default": false,
"description": "Add bias to nn.Linear of MLP FFN.",
"title": "Add Ffnbias",
"type": "boolean"
},
"parallel_residual": {
"default": false,
"description": "Use parallel residual in decoder layer. Note: this is used by GPT-J / Falcon Architecture.",
"title": "Parallel Residual",
"type": "boolean"
},
"num_experts": {
"default": 0,
"description": "Number of experts for MoE models.",
"title": "Num Experts",
"type": "integer"
},
"num_experts_per_tok": {
"default": 2,
"description": "Number of experts per token.",
"title": "Num Experts Per Tok",
"type": "integer"
},
"position_encoding_type": {
"anyOf": [
{
"$ref": "#/$defs/PositionEncodingType"
},
{
"type": "null"
}
],
"default": "SinusoidalInterleaved",
"description": "Type of positional encoding."
},
"n_positions": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Two casesCase 1: Absolute number of positions to learn position embeddings on (position_encoding_type: Learned)Case 2: Max Relative PositionsIn the case of position_encoding_type: Relative",
"title": "N Positions"
},
"rope_config": {
"anyOf": [
{
"$ref": "#/$defs/RotaryPositionConfig"
},
{
"type": "null"
}
],
"default": null,
"description": "Rotary position config, if relevant."
},
"aan_useffn": {
"default": false,
"description": "Turn on the FFN layer in the AAN decoder.",
"title": "Aan Useffn",
"type": "boolean"
},
"alignment_layer": {
"default": -2,
"description": "Layer number which has to be supervised.",
"title": "Alignment Layer",
"type": "integer"
},
"alignment_heads": {
"default": 0,
"description": "Number of cross attention heads per layer to supervise with.",
"title": "Alignment Heads",
"type": "integer"
},
"full_context_alignment": {
"default": false,
"description": "Whether alignment is conditioned on full target context.",
"title": "Full Context Alignment",
"type": "boolean"
},
"lambda_align": {
"default": 0.0,
"description": "Lambda value for alignement loss of Garg et al, 2019 (https://arxiv.org/abs/1909.02074)",
"title": "Lambda Align",
"type": "number"
}
},
"title": "TransformerLMDecoderConfig",
"type": "object"
}
},
"additionalProperties": false
}

field architecture : Literal['cnn'] = 'cnn'

validator default_architecture » all fields[source]

validator encoder_decoder_type » all fields[source]
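
To tie the pieces together, a hedged end-to-end sketch composing the CNN encoder and decoder configs into a CnnModelConfig. The validators default_architecture and encoder_decoder_type run across all fields, so they may adjust or reject values beyond what this minimal example shows.

from eole.config.models import CnnDecoderConfig, CnnEncoderConfig, CnnModelConfig

model_config = CnnModelConfig(
    embeddings={"word_vec_size": 256},          # resolved into EmbeddingsConfig
    encoder=CnnEncoderConfig(layers=4, cnn_kernel_width=3),
    decoder=CnnDecoderConfig(layers=4, cnn_kernel_width=3),
    hidden_size=256,         # overwrites encoder/decoder hidden_size when set
    share_embeddings=True,   # requires a shared src/tgt vocabulary
)
print(model_config.architecture)  # 'cnn'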