Skip to main content

Data

pydantic model eole.config.data.BaseVocabConfig[source]

Bases: Config

Show JSON schema
{
"title": "BaseVocabConfig",
"type": "object",
"properties": {
"src_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "Path to src (or shared) vocabulary file. Format: one <word> or <word>\t<count> per line.",
"title": "Src Vocab"
},
"tgt_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to tgt vocabulary file. Format: one <word> or <word>\t<count> per line.",
"title": "Tgt Vocab"
},
"share_vocab": {
"default": false,
"description": "Share source and target vocabulary.",
"title": "Share Vocab",
"type": "boolean"
},
"decoder_start_token": {
"default": "&lt;s&gt;",
"description": "Default decoder start token. For most models it is &lt;s&gt; = BOS. Some fairseq models require &lt;/s&gt;.",
"title": "Decoder Start Token",
"type": "string"
},
"bos_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "&lt;s&gt;",
"title": "Bos Token"
},
"eos_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "&lt;/s&gt;",
"title": "Eos Token"
},
"unk_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "<unk>",
"title": "Unk Token"
},
"pad_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "<blank>",
"title": "Pad Token"
},
"both_embeddings": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to the embeddings file to use for both source and target tokens.",
"title": "Both Embeddings"
},
"src_embeddings": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to the embeddings file to use for source tokens.",
"title": "Src Embeddings"
},
"tgt_embeddings": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to the embeddings file to use for target tokens.",
"title": "Tgt Embeddings"
},
"embeddings_type": {
"anyOf": [
{
"enum": [
"GloVe",
"word2vec"
],
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Type of embeddings file.",
"title": "Embeddings Type"
}
},
"additionalProperties": false,
"required": [
"src_vocab"
]
}

field bos_token : str | None = '<s>'

field both_embeddings : str | None = None

Path to the embeddings file to use for both source and target tokens.

field decoder_start_token : str = '<s>'

Default decoder start token. For most models it is <s> = BOS. Some fairseq models require </s>.

field embeddings_type : Literal['GloVe', 'word2vec'] | None = None

Type of embeddings file.

field eos_token : str | None = '</s>'

field pad_token : str | None = '<blank>'

field share_vocab : bool = False

Share source and target vocabulary.

field src_embeddings : str | None = None

Path to the embeddings file to use for source tokens.

field src_vocab : str | None [Required]

Path to src (or shared) vocabulary file. Format: one <word> or <word>\t<count> per line.

field tgt_embeddings : str | None = None

Path to the embeddings file to use for target tokens.

field tgt_vocab : str | None = None

Path to tgt vocabulary file. Format: one <word> or <word>\t<count> per line.

field unk_token : str | None = '<unk>'

pydantic model eole.config.data.VocabConfig[source]

Bases: BaseVocabConfig

Show JSON schema
{
"title": "VocabConfig",
"type": "object",
"properties": {
"src_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "Path to src (or shared) vocabulary file. Format: one <word> or <word>\t<count> per line.",
"title": "Src Vocab"
},
"tgt_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to tgt vocabulary file. Format: one <word> or <word>\t<count> per line.",
"title": "Tgt Vocab"
},
"share_vocab": {
"default": false,
"description": "Share source and target vocabulary.",
"title": "Share Vocab",
"type": "boolean"
},
"decoder_start_token": {
"default": "&lt;s&gt;",
"description": "Default decoder start token. For most models it is &lt;s&gt; = BOS. Some fairseq models require &lt;/s&gt;.",
"title": "Decoder Start Token",
"type": "string"
},
"bos_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "&lt;s&gt;",
"title": "Bos Token"
},
"eos_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "&lt;/s&gt;",
"title": "Eos Token"
},
"unk_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "<unk>",
"title": "Unk Token"
},
"pad_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "<blank>",
"title": "Pad Token"
},
"both_embeddings": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to the embeddings file to use for both source and target tokens.",
"title": "Both Embeddings"
},
"src_embeddings": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to the embeddings file to use for source tokens.",
"title": "Src Embeddings"
},
"tgt_embeddings": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to the embeddings file to use for target tokens.",
"title": "Tgt Embeddings"
},
"embeddings_type": {
"anyOf": [
{
"enum": [
"GloVe",
"word2vec"
],
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Type of embeddings file.",
"title": "Embeddings Type"
},
"src_vocab_size": {
"default": 32758,
"description": "Maximum size of the source vocabulary.",
"title": "Src Vocab Size",
"type": "integer"
},
"tgt_vocab_size": {
"default": 32768,
"description": "Maximum size of the target vocabulary.",
"title": "Tgt Vocab Size",
"type": "integer"
},
"vocab_size_multiple": {
"default": 8,
"description": "Make the vocabulary size a multiple of this value. (Adds dummy tokens if needed.)",
"title": "Vocab Size Multiple",
"type": "integer"
},
"src_words_min_frequency": {
"default": 0,
"description": "Discard source words with lower frequency.",
"title": "Src Words Min Frequency",
"type": "integer"
},
"tgt_words_min_frequency": {
"default": 0,
"description": "Discard target words with lower frequency.",
"title": "Tgt Words Min Frequency",
"type": "integer"
}
},
"additionalProperties": false,
"required": [
"src_vocab"
]
}

field src_vocab_size : int = 32758

Maximum size of the source vocabulary.

field src_words_min_frequency : int = 0

Discard source words with lower frequency.

field tgt_vocab_size : int = 32768

Maximum size of the target vocabulary.

field tgt_words_min_frequency : int = 0

Discard target words with lower frequency.

field vocab_size_multiple : int = 8

Make the vocabulary size a multiple of this value. (Adds dummy tokens if needed.)

pydantic model eole.config.data.Dataset[source]

Bases: Config

Show JSON schema
{
"title": "Dataset",
"type": "object",
"properties": {
"name": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Name"
},
"weight": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"title": "Weight"
},
"transforms": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"title": "Transforms"
},
"path_src": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Src"
},
"path_tgt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Tgt"
},
"path_sco": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Sco"
},
"path_txt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Txt"
},
"path_align": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Align"
},
"src_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Src Prefix"
},
"tgt_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Tgt Prefix"
},
"src_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Src Suffix"
},
"tgt_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Tgt Suffix"
},
"src_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Src Lang"
},
"tgt_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Tgt Lang"
},
"penn": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Penn"
},
"norm_quote_commas": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Norm Quote Commas"
},
"norm_numbers": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Norm Numbers"
},
"pre_replace_unicode_punct": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Pre Replace Unicode Punct"
},
"post_remove_control_chars": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Post Remove Control Chars"
},
"src_eq_tgt": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Src Eq Tgt"
},
"same_char": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Same Char"
},
"same_word": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Same Word"
},
"scripts_ok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"Latin",
"Common"
],
"title": "Scripts Ok"
},
"scripts_nok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"title": "Scripts Nok"
},
"src_tgt_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 2,
"title": "Src Tgt Ratio"
},
"avg_tok_min": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 3,
"title": "Avg Tok Min"
},
"avg_tok_max": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 20,
"title": "Avg Tok Max"
},
"lang_id": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"en",
"fr"
],
"title": "Lang Id"
}
},
"additionalProperties": false
}

field avg_tok_max : float | None = 20

field avg_tok_min : float | None = 3

field lang_id : List[str] | None = ['en', 'fr']

field name : str | None = None

field norm_numbers : bool | None = True

field norm_quote_commas : bool | None = True

field path_align : str | None = None

field path_sco : str | None = None

field path_src : str | None = None

field path_tgt : str | None = None

field path_txt : str | None = None

field penn : bool | None = True

field post_remove_control_chars : bool | None = False

field pre_replace_unicode_punct : bool | None = False

field same_char : bool | None = True

field same_word : bool | None = True

field scripts_nok : List[str] | None = []

field scripts_ok : List[str] | None = ['Latin', 'Common']

field src_eq_tgt : bool | None = True

field src_lang : str | None = None

field src_prefix : str | None = None

field src_suffix : str | None = None

field src_tgt_ratio : float | None = 2

field tgt_lang : str | None = None

field tgt_prefix : str | None = None

field tgt_suffix : str | None = None

field transforms : List[str] | None = None

field weight : int | None = 1

pydantic model eole.config.data.DataConfig[source]

Bases: VocabConfig

Show JSON schema
{
"title": "DataConfig",
"type": "object",
"properties": {
"src_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"description": "Path to src (or shared) vocabulary file. Format: one <word> or <word>\t<count> per line.",
"title": "Src Vocab"
},
"tgt_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to tgt vocabulary file. Format: one <word> or <word>\t<count> per line.",
"title": "Tgt Vocab"
},
"share_vocab": {
"default": false,
"description": "Share source and target vocabulary.",
"title": "Share Vocab",
"type": "boolean"
},
"decoder_start_token": {
"default": "&lt;s&gt;",
"description": "Default decoder start token. For most models it is &lt;s&gt; = BOS. Some fairseq models require &lt;/s&gt;.",
"title": "Decoder Start Token",
"type": "string"
},
"bos_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "&lt;s&gt;",
"title": "Bos Token"
},
"eos_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "&lt;/s&gt;",
"title": "Eos Token"
},
"unk_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "<unk>",
"title": "Unk Token"
},
"pad_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "<blank>",
"title": "Pad Token"
},
"both_embeddings": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to the embeddings file to use for both source and target tokens.",
"title": "Both Embeddings"
},
"src_embeddings": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to the embeddings file to use for source tokens.",
"title": "Src Embeddings"
},
"tgt_embeddings": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to the embeddings file to use for target tokens.",
"title": "Tgt Embeddings"
},
"embeddings_type": {
"anyOf": [
{
"enum": [
"GloVe",
"word2vec"
],
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Type of embeddings file.",
"title": "Embeddings Type"
},
"src_vocab_size": {
"default": 32758,
"description": "Maximum size of the source vocabulary.",
"title": "Src Vocab Size",
"type": "integer"
},
"tgt_vocab_size": {
"default": 32768,
"description": "Maximum size of the target vocabulary.",
"title": "Tgt Vocab Size",
"type": "integer"
},
"vocab_size_multiple": {
"default": 8,
"description": "Make the vocabulary size a multiple of this value. (Adds dummy tokens if needed.)",
"title": "Vocab Size Multiple",
"type": "integer"
},
"src_words_min_frequency": {
"default": 0,
"description": "Discard source words with lower frequency.",
"title": "Src Words Min Frequency",
"type": "integer"
},
"tgt_words_min_frequency": {
"default": 0,
"description": "Discard target words with lower frequency.",
"title": "Tgt Words Min Frequency",
"type": "integer"
},
"data": {
"anyOf": [
{
"additionalProperties": {
"$ref": "#/$defs/Dataset"
},
"type": "object"
},
{
"type": "null"
}
],
"description": "All datasets and their specifications. See examples/*.yaml for further details.",
"title": "Data"
},
"transforms": {
"default": [],
"description": "Default transform pipeline to apply to data. Can be specified in each corpus of data to override.",
"items": {
"type": "string"
},
"title": "Transforms",
"type": "array"
},
"transforms_configs": {
"anyOf": [
{
"$ref": "#/$defs/NestedAllTransformsConfig"
},
{
"type": "null"
}
]
},
"skip_empty_level": {
"default": "warning",
"description": "Logging level when encoutering empty examples. (silent: silently ignore/skip empty examples, warning: warn when ignoring/skipping empty examples, error: raise an error and stop execution when any empty example)",
"enum": [
"silent",
"warning",
"error"
],
"title": "Skip Empty Level",
"type": "string"
},
"n_sample": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"title": "N Sample"
},
"save_data": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Output base path for objects that will be saved (vocab, transforms, embeddings, ...)",
"title": "Save Data"
},
"overwrite": {
"default": false,
"description": "Overwrite existing objects if any.",
"title": "Overwrite",
"type": "boolean"
}
},
"$defs": {
"BARTNoiseConfig": {
"additionalProperties": false,
"properties": {
"permute_sent_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Permute this proportion of sentences (boundaries defined by ['.', '?', '!']) in all inputs.",
"title": "Permute Sent Ratio"
},
"rotate_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Rotate this proportion of inputs.",
"title": "Rotate Ratio"
},
"insert_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Insert this percentage of additional random tokens.",
"title": "Insert Ratio"
},
"random_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Instead of using <mask>, use random token this often.",
"title": "Random Ratio"
},
"mask_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Fraction of words/subwords that will be masked.",
"title": "Mask Ratio"
},
"mask_length": {
"anyOf": [
{
"enum": [
"subword",
"word",
"span-poisson"
],
"type": "string"
},
{
"type": "null"
}
],
"default": "subword",
"description": "Length of masking window to apply.",
"title": "Mask Length"
},
"poisson_lambda": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 3.0,
"description": "Lambda for Poisson distribution to sample span length if `-mask_length` set to span-poisson.",
"title": "Poisson Lambda"
},
"replace_length": {
"anyOf": [
{
"maximum": 1,
"minimum": -1,
"type": "integer"
},
{
"type": "null"
}
],
"default": -1,
"description": "When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)",
"title": "Replace Length"
}
},
"title": "BARTNoiseConfig",
"type": "object"
},
"BaseTokenizerConfig": {
"additionalProperties": false,
"properties": {
"src_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for src (or shared).",
"title": "Src Subword Model"
},
"tgt_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for tgt.",
"title": "Tgt Subword Model"
},
"src_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (source side)",
"title": "Src Subword Nbest"
},
"tgt_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (target side)",
"title": "Tgt Subword Nbest"
},
"src_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (source side)",
"title": "Src Subword Alpha"
},
"tgt_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (target side)",
"title": "Tgt Subword Alpha"
},
"src_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for src subword. Format: <word>\\t<count> per line.",
"title": "Src Subword Vocab"
},
"tgt_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for tgt subword. Format: <word>\\t<count> per line.",
"title": "Tgt Subword Vocab"
},
"src_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce src subword in src_subword_vocab with frequency >= src_vocab_threshold.",
"title": "Src Vocab Threshold"
},
"tgt_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce tgt subword in tgt_subword_vocab with frequency >= tgt_vocab_threshold.",
"title": "Tgt Vocab Threshold"
}
},
"title": "BaseTokenizerConfig",
"type": "object"
},
"CleanConfig": {
"additionalProperties": false,
"properties": {
"src_eq_tgt": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove ex src==tgt",
"title": "Src Eq Tgt"
},
"same_char": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove ex with same char more than 4 times",
"title": "Same Char"
},
"same_word": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove ex with same word more than 3 times",
"title": "Same Word"
},
"scripts_ok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"Latin",
"Common"
],
"description": "list of unicodata scripts accepted",
"title": "Scripts Ok"
},
"scripts_nok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"description": "list of unicodata scripts not accepted",
"title": "Scripts Nok"
},
"src_tgt_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 2.0,
"description": "ratio between src and tgt",
"title": "Src Tgt Ratio"
},
"avg_tok_min": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 3.0,
"description": "average length of tokens min",
"title": "Avg Tok Min"
},
"avg_tok_max": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 20.0,
"description": "average length of tokens max",
"title": "Avg Tok Max"
},
"langid": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"description": "list of languages accepted",
"title": "Langid"
}
},
"title": "CleanConfig",
"type": "object"
},
"Dataset": {
"additionalProperties": false,
"properties": {
"name": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Name"
},
"weight": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"title": "Weight"
},
"transforms": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"title": "Transforms"
},
"path_src": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Src"
},
"path_tgt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Tgt"
},
"path_sco": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Sco"
},
"path_txt": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Txt"
},
"path_align": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path Align"
},
"src_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Src Prefix"
},
"tgt_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Tgt Prefix"
},
"src_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Src Suffix"
},
"tgt_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Tgt Suffix"
},
"src_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Src Lang"
},
"tgt_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Tgt Lang"
},
"penn": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Penn"
},
"norm_quote_commas": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Norm Quote Commas"
},
"norm_numbers": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Norm Numbers"
},
"pre_replace_unicode_punct": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Pre Replace Unicode Punct"
},
"post_remove_control_chars": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"title": "Post Remove Control Chars"
},
"src_eq_tgt": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Src Eq Tgt"
},
"same_char": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Same Char"
},
"same_word": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"title": "Same Word"
},
"scripts_ok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"Latin",
"Common"
],
"title": "Scripts Ok"
},
"scripts_nok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"title": "Scripts Nok"
},
"src_tgt_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 2,
"title": "Src Tgt Ratio"
},
"avg_tok_min": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 3,
"title": "Avg Tok Min"
},
"avg_tok_max": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 20,
"title": "Avg Tok Max"
},
"lang_id": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"en",
"fr"
],
"title": "Lang Id"
}
},
"title": "Dataset",
"type": "object"
},
"DocifyConfig": {
"additionalProperties": false,
"properties": {
"doc_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 200,
"description": "Number of tokens per doc.",
"title": "Doc Length"
},
"max_context": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Max context segments.",
"title": "Max Context"
}
},
"title": "DocifyConfig",
"type": "object"
},
"FilterTooLongConfig": {
"additionalProperties": false,
"properties": {
"src_seq_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 192,
"description": "Maximum source sequence length.",
"title": "Src Seq Length"
},
"tgt_seq_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 192,
"description": "Maximum target sequence length.",
"title": "Tgt Seq Length"
}
},
"title": "FilterTooLongConfig",
"type": "object"
},
"HuggingfaceTokenizerConfig": {
"additionalProperties": false,
"properties": {
"path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Huggingface Model"
},
"max_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"title": "Max Length"
}
},
"title": "HuggingfaceTokenizerConfig",
"type": "object"
},
"InlineTagsConfig": {
"additionalProperties": false,
"properties": {
"tags_dictionary_path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to a flat term dictionary.",
"title": "Tags Dictionary Path"
},
"tags_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.1,
"description": "Ratio of corpus to augment with tags.",
"title": "Tags Corpus Ratio"
},
"max_tags": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 12,
"description": "Maximum number of tags that can be added to a single sentence.",
"title": "Max Tags"
},
"paired_stag": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fph_#_beg\uff60",
"description": "The format of an opening paired inline tag. Must include the character #.",
"title": "Paired Stag"
},
"paired_etag": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fph_#_end\uff60",
"description": "The format of a closing paired inline tag. Must include the character #.",
"title": "Paired Etag"
},
"isolated_tag": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fph_#_std\uff60",
"description": "The format of an isolated inline tag. Must include the character #.",
"title": "Isolated Tag"
},
"src_delimiter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ffuzzy\uff60",
"description": "Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.",
"title": "Src Delimiter"
}
},
"title": "InlineTagsConfig",
"type": "object"
},
"InsertMaskBeforePlaceholderConfig": {
"additionalProperties": false,
"properties": {
"response_patterns": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"Response : \uff5fnewline\uff60"
],
"description": "Response pattern to locate the end of the prompt.",
"title": "Response Patterns"
}
},
"title": "InsertMaskBeforePlaceholderConfig",
"type": "object"
},
"NestedAllTransformsConfig": {
"additionalProperties": false,
"properties": {
"switchout": {
"$ref": "#/$defs/SwitchOutConfig",
"default": {
"switchout_temperature": 1.0
}
},
"tokendrop": {
"$ref": "#/$defs/TokenDropConfig",
"default": {
"tokendrop_temperature": 1.0
}
},
"tokenmask": {
"$ref": "#/$defs/TokenMaskConfig",
"default": {
"tokenmask_temperature": 1.0
}
},
"sentencepiece": {
"$ref": "#/$defs/BaseTokenizerConfig",
"default": {
"src_subword_model": null,
"tgt_subword_model": null,
"src_subword_nbest": 1,
"tgt_subword_nbest": 1,
"src_subword_alpha": 0.0,
"tgt_subword_alpha": 0.0,
"src_subword_vocab": "",
"tgt_subword_vocab": "",
"src_vocab_threshold": 0,
"tgt_vocab_threshold": 0
}
},
"bpe": {
"$ref": "#/$defs/BaseTokenizerConfig",
"default": {
"src_subword_model": null,
"tgt_subword_model": null,
"src_subword_nbest": 1,
"tgt_subword_nbest": 1,
"src_subword_alpha": 0.0,
"tgt_subword_alpha": 0.0,
"src_subword_vocab": "",
"tgt_subword_vocab": "",
"src_vocab_threshold": 0,
"tgt_vocab_threshold": 0
}
},
"onmt_tokenize": {
"$ref": "#/$defs/ONMTTokenizerConfig",
"default": {
"src_subword_model": null,
"tgt_subword_model": null,
"src_subword_nbest": 1,
"tgt_subword_nbest": 1,
"src_subword_alpha": 0.0,
"tgt_subword_alpha": 0.0,
"src_subword_vocab": "",
"tgt_subword_vocab": "",
"src_vocab_threshold": 0,
"tgt_vocab_threshold": 0,
"src_subword_type": "none",
"tgt_subword_type": "none",
"src_onmttok_kwargs": {
"mode": "none"
},
"tgt_onmttok_kwargs": {
"mode": "none"
},
"gpt2_pretok": false,
"mapped_tokens": null
}
},
"inlinetags": {
"$ref": "#/$defs/InlineTagsConfig",
"default": {
"tags_dictionary_path": null,
"tags_corpus_ratio": 0.1,
"max_tags": 12,
"paired_stag": "\uff5fph_#_beg\uff60",
"paired_etag": "\uff5fph_#_end\uff60",
"isolated_tag": "\uff5fph_#_std\uff60",
"src_delimiter": "\uff5ffuzzy\uff60"
}
},
"huggingface_tokenize": {
"$ref": "#/$defs/HuggingfaceTokenizerConfig",
"default": {
"path": null,
"huggingface_model": null,
"max_length": null
}
},
"uppercase": {
"$ref": "#/$defs/UpperCaseConfig",
"default": {
"upper_corpus_ratio": 0.01
}
},
"bart": {
"$ref": "#/$defs/BARTNoiseConfig",
"default": {
"permute_sent_ratio": 0.0,
"rotate_ratio": 0.0,
"insert_ratio": 0.0,
"random_ratio": 0.0,
"mask_ratio": 0.0,
"mask_length": "subword",
"poisson_lambda": 3.0,
"replace_length": -1
}
},
"docify": {
"$ref": "#/$defs/DocifyConfig",
"default": {
"doc_length": 200,
"max_context": 1
}
},
"normalize": {
"$ref": "#/$defs/NormalizeConfig",
"default": {
"src_lang": "",
"tgt_lang": "",
"penn": true,
"norm_quote_commas": true,
"norm_numbers": true,
"pre_replace_unicode_punct": false,
"post_remove_control_chars": false
}
},
"terminology": {
"$ref": "#/$defs/TerminologyConfig",
"default": {
"termbase_path": null,
"src_spacy_language_model": null,
"tgt_spacy_language_model": null,
"term_corpus_ratio": 0.3,
"term_example_ratio": 0.2,
"src_term_stoken": "\uff5fsrc_term_start\uff60",
"tgt_term_stoken": "\uff5ftgt_term_start\uff60",
"tgt_term_etoken": "\uff5ftgt_term_end\uff60",
"term_source_delimiter": "\uff5ffuzzy\uff60"
}
},
"clean": {
"$ref": "#/$defs/CleanConfig",
"default": {
"src_eq_tgt": false,
"same_char": false,
"same_word": false,
"scripts_ok": [
"Latin",
"Common"
],
"scripts_nok": [],
"src_tgt_ratio": 2.0,
"avg_tok_min": 3.0,
"avg_tok_max": 20.0,
"langid": []
}
},
"filtertoolong": {
"$ref": "#/$defs/FilterTooLongConfig",
"default": {
"src_seq_length": 192,
"tgt_seq_length": 192
}
},
"prefix": {
"$ref": "#/$defs/PrefixConfig",
"default": {
"src_prefix": "",
"tgt_prefix": ""
}
},
"suffix": {
"$ref": "#/$defs/SuffixConfig",
"default": {
"src_suffix": "",
"tgt_suffix": ""
}
},
"insert_mask_before_placeholder": {
"$ref": "#/$defs/InsertMaskBeforePlaceholderConfig",
"default": {
"response_patterns": [
"Response : \uff5fnewline\uff60"
]
}
}
},
"title": "NestedAllTransformsConfig",
"type": "object"
},
"NormalizeConfig": {
"additionalProperties": false,
"properties": {
"src_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Source language code",
"title": "Src Lang"
},
"tgt_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Target language code",
"title": "Tgt Lang"
},
"penn": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"description": "Penn substitution",
"title": "Penn"
},
"norm_quote_commas": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"description": "Normalize quotations and commas",
"title": "Norm Quote Commas"
},
"norm_numbers": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"description": "Normalize numbers",
"title": "Norm Numbers"
},
"pre_replace_unicode_punct": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Replace unicode punct",
"title": "Pre Replace Unicode Punct"
},
"post_remove_control_chars": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove control chars",
"title": "Post Remove Control Chars"
}
},
"title": "NormalizeConfig",
"type": "object"
},
"ONMTTokenizerConfig": {
"additionalProperties": false,
"properties": {
"src_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for src (or shared).",
"title": "Src Subword Model"
},
"tgt_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for tgt.",
"title": "Tgt Subword Model"
},
"src_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (source side)",
"title": "Src Subword Nbest"
},
"tgt_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (target side)",
"title": "Tgt Subword Nbest"
},
"src_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (source side)",
"title": "Src Subword Alpha"
},
"tgt_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (target side)",
"title": "Tgt Subword Alpha"
},
"src_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for src subword. Format: <word>\\t<count> per line.",
"title": "Src Subword Vocab"
},
"tgt_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for tgt subword. Format: <word>\\t<count> per line.",
"title": "Tgt Subword Vocab"
},
"src_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce src subword in src_subword_vocab with frequency >= src_vocab_threshold.",
"title": "Src Vocab Threshold"
},
"tgt_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce tgt subword in tgt_subword_vocab with frequency >= tgt_vocab_threshold.",
"title": "Tgt Vocab Threshold"
},
"src_subword_type": {
"anyOf": [
{
"enum": [
"none",
"sentencepiece",
"bpe"
],
"type": "string"
},
{
"type": "null"
}
],
"default": "none",
"description": "Type of subword model for src (or shared) in pyonmttok.",
"title": "Src Subword Type"
},
"tgt_subword_type": {
"anyOf": [
{
"enum": [
"none",
"sentencepiece",
"bpe"
],
"type": "string"
},
{
"type": "null"
}
],
"default": "none",
"description": "Type of subword model for tgt in pyonmttok.",
"title": "Tgt Subword Type"
},
"src_onmttok_kwargs": {
"anyOf": [
{
"type": "object"
},
{
"type": "null"
}
],
"default": {
"mode": "none"
},
"description": "Other pyonmttok options for src in dict string, except subword related options listed earlier.",
"title": "Src Onmttok Kwargs"
},
"tgt_onmttok_kwargs": {
"anyOf": [
{
"type": "object"
},
{
"type": "null"
}
],
"default": {
"mode": "none"
},
"description": "Other pyonmttok options for tgt in dict string, except subword related options listed earlier.",
"title": "Tgt Onmttok Kwargs"
},
"gpt2_pretok": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Preprocess sentence with byte-level mapping.",
"title": "Gpt2 Pretok"
},
"mapped_tokens": {
"anyOf": [
{
"items": {
"maxItems": 2,
"minItems": 2,
"prefixItems": [
{
"type": "string"
},
{
"type": "string"
}
],
"type": "array"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Mapped tokens for placeholders preservation",
"title": "Mapped Tokens"
}
},
"title": "ONMTTokenizerConfig",
"type": "object"
},
"PrefixConfig": {
"additionalProperties": false,
"properties": {
"src_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to prepend to all source examples.",
"title": "Src Prefix"
},
"tgt_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to prepend to all target examples.",
"title": "Tgt Prefix"
}
},
"title": "PrefixConfig",
"type": "object"
},
"SuffixConfig": {
"additionalProperties": false,
"properties": {
"src_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to append to all source examples.",
"title": "Src Suffix"
},
"tgt_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to append to all target examples.",
"title": "Tgt Suffix"
}
},
"title": "SuffixConfig",
"type": "object"
},
"SwitchOutConfig": {
"additionalProperties": false,
"properties": {
"switchout_temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Sampling temperature for SwitchOut. :math:`\\tau^{-1}` in :cite:`DBLP:journals/corr/abs-1808-07512`. Smaller value makes data more diverse.",
"title": "Switchout Temperature"
}
},
"title": "SwitchOutConfig",
"type": "object"
},
"TerminologyConfig": {
"additionalProperties": false,
"properties": {
"termbase_path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to a dictionary file with terms.",
"title": "Termbase Path"
},
"src_spacy_language_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Name of the spaCy language model for the source corpus.",
"title": "Src Spacy Language Model"
},
"tgt_spacy_language_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Name of the spaCy language model for the target corpus.",
"title": "Tgt Spacy Language Model"
},
"term_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.3,
"description": "Ratio of corpus to augment with terms.",
"title": "Term Corpus Ratio"
},
"term_example_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.2,
"description": "Maximum terms allowed in an example.",
"title": "Term Example Ratio"
},
"src_term_stoken": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fsrc_term_start\uff60",
"description": "The source term start token.",
"title": "Src Term Stoken"
},
"tgt_term_stoken": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ftgt_term_start\uff60",
"description": "The target term start token.",
"title": "Tgt Term Stoken"
},
"tgt_term_etoken": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ftgt_term_end\uff60",
"description": "The target term end token.",
"title": "Tgt Term Etoken"
},
"term_source_delimiter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ffuzzy\uff60",
"description": "Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.",
"title": "Term Source Delimiter"
}
},
"title": "TerminologyConfig",
"type": "object"
},
"TokenDropConfig": {
"additionalProperties": false,
"properties": {
"tokendrop_temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Sampling temperature for token deletion.",
"title": "Tokendrop Temperature"
}
},
"title": "TokenDropConfig",
"type": "object"
},
"TokenMaskConfig": {
"additionalProperties": false,
"properties": {
"tokenmask_temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Sampling temperature for token masking.",
"title": "Tokenmask Temperature"
}
},
"title": "TokenMaskConfig",
"type": "object"
},
"UpperCaseConfig": {
"additionalProperties": false,
"properties": {
"upper_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.01,
"description": "Corpus ratio to apply uppercasing.",
"title": "Upper Corpus Ratio"
}
},
"title": "UpperCaseConfig",
"type": "object"
}
},
"additionalProperties": false,
"required": [
"src_vocab",
"data"
]
}

field data : Dict[str, Dataset] | None [Required]

All datasets and their specifications. See examples/*.yaml for further details.

  • Validated by:
    • _maybe_set_huggingface_model

field n_sample : int | None = 0

  • Validated by:
    • _maybe_set_huggingface_model

field overwrite : bool = False

Overwrite existing objects if any.

  • Validated by:
    • _maybe_set_huggingface_model

field save_data : str | None = None

Output base path for objects that will be saved (vocab, transforms, embeddings, …)

  • Validated by:
    • _maybe_set_huggingface_model

field skip_empty_level : Literal['silent', 'warning', 'error'] = 'warning'

Logging level when encountering empty examples. (silent: silently ignore/skip empty examples, warning: warn when ignoring/skipping empty examples, error: raise an error and stop execution when any empty example is encountered)

  • Validated by:
    • _maybe_set_huggingface_model

field transforms : List[str] = []

Default transform pipeline to apply to data. Can be specified in each corpus of data to override.

  • Validated by:
    • _maybe_set_huggingface_model

field transforms_configs : NestedAllTransformsConfig | None [Optional]

  • Validated by:
    • _maybe_set_huggingface_model
    • _str_to_dict

model_post_init(context: Any, /)

This function is meant to behave like a BaseModel method to initialise private attributes.

It takes context as an argument since that’s what pydantic-core passes when calling it.

  • Parameters:
    • self – The BaseModel instance.
    • context – The context.

pydantic model eole.config.data.NestedAllTransformsConfig

Bases: Config

Show JSON schema
{
"title": "NestedAllTransformsConfig",
"type": "object",
"properties": {
"switchout": {
"$ref": "#/$defs/SwitchOutConfig",
"default": {
"switchout_temperature": 1.0
}
},
"tokendrop": {
"$ref": "#/$defs/TokenDropConfig",
"default": {
"tokendrop_temperature": 1.0
}
},
"tokenmask": {
"$ref": "#/$defs/TokenMaskConfig",
"default": {
"tokenmask_temperature": 1.0
}
},
"sentencepiece": {
"$ref": "#/$defs/BaseTokenizerConfig",
"default": {
"src_subword_model": null,
"tgt_subword_model": null,
"src_subword_nbest": 1,
"tgt_subword_nbest": 1,
"src_subword_alpha": 0.0,
"tgt_subword_alpha": 0.0,
"src_subword_vocab": "",
"tgt_subword_vocab": "",
"src_vocab_threshold": 0,
"tgt_vocab_threshold": 0
}
},
"bpe": {
"$ref": "#/$defs/BaseTokenizerConfig",
"default": {
"src_subword_model": null,
"tgt_subword_model": null,
"src_subword_nbest": 1,
"tgt_subword_nbest": 1,
"src_subword_alpha": 0.0,
"tgt_subword_alpha": 0.0,
"src_subword_vocab": "",
"tgt_subword_vocab": "",
"src_vocab_threshold": 0,
"tgt_vocab_threshold": 0
}
},
"onmt_tokenize": {
"$ref": "#/$defs/ONMTTokenizerConfig",
"default": {
"src_subword_model": null,
"tgt_subword_model": null,
"src_subword_nbest": 1,
"tgt_subword_nbest": 1,
"src_subword_alpha": 0.0,
"tgt_subword_alpha": 0.0,
"src_subword_vocab": "",
"tgt_subword_vocab": "",
"src_vocab_threshold": 0,
"tgt_vocab_threshold": 0,
"src_subword_type": "none",
"tgt_subword_type": "none",
"src_onmttok_kwargs": {
"mode": "none"
},
"tgt_onmttok_kwargs": {
"mode": "none"
},
"gpt2_pretok": false,
"mapped_tokens": null
}
},
"inlinetags": {
"$ref": "#/$defs/InlineTagsConfig",
"default": {
"tags_dictionary_path": null,
"tags_corpus_ratio": 0.1,
"max_tags": 12,
"paired_stag": "\uff5fph_#_beg\uff60",
"paired_etag": "\uff5fph_#_end\uff60",
"isolated_tag": "\uff5fph_#_std\uff60",
"src_delimiter": "\uff5ffuzzy\uff60"
}
},
"huggingface_tokenize": {
"$ref": "#/$defs/HuggingfaceTokenizerConfig",
"default": {
"path": null,
"huggingface_model": null,
"max_length": null
}
},
"uppercase": {
"$ref": "#/$defs/UpperCaseConfig",
"default": {
"upper_corpus_ratio": 0.01
}
},
"bart": {
"$ref": "#/$defs/BARTNoiseConfig",
"default": {
"permute_sent_ratio": 0.0,
"rotate_ratio": 0.0,
"insert_ratio": 0.0,
"random_ratio": 0.0,
"mask_ratio": 0.0,
"mask_length": "subword",
"poisson_lambda": 3.0,
"replace_length": -1
}
},
"docify": {
"$ref": "#/$defs/DocifyConfig",
"default": {
"doc_length": 200,
"max_context": 1
}
},
"normalize": {
"$ref": "#/$defs/NormalizeConfig",
"default": {
"src_lang": "",
"tgt_lang": "",
"penn": true,
"norm_quote_commas": true,
"norm_numbers": true,
"pre_replace_unicode_punct": false,
"post_remove_control_chars": false
}
},
"terminology": {
"$ref": "#/$defs/TerminologyConfig",
"default": {
"termbase_path": null,
"src_spacy_language_model": null,
"tgt_spacy_language_model": null,
"term_corpus_ratio": 0.3,
"term_example_ratio": 0.2,
"src_term_stoken": "\uff5fsrc_term_start\uff60",
"tgt_term_stoken": "\uff5ftgt_term_start\uff60",
"tgt_term_etoken": "\uff5ftgt_term_end\uff60",
"term_source_delimiter": "\uff5ffuzzy\uff60"
}
},
"clean": {
"$ref": "#/$defs/CleanConfig",
"default": {
"src_eq_tgt": false,
"same_char": false,
"same_word": false,
"scripts_ok": [
"Latin",
"Common"
],
"scripts_nok": [],
"src_tgt_ratio": 2.0,
"avg_tok_min": 3.0,
"avg_tok_max": 20.0,
"langid": []
}
},
"filtertoolong": {
"$ref": "#/$defs/FilterTooLongConfig",
"default": {
"src_seq_length": 192,
"tgt_seq_length": 192
}
},
"prefix": {
"$ref": "#/$defs/PrefixConfig",
"default": {
"src_prefix": "",
"tgt_prefix": ""
}
},
"suffix": {
"$ref": "#/$defs/SuffixConfig",
"default": {
"src_suffix": "",
"tgt_suffix": ""
}
},
"insert_mask_before_placeholder": {
"$ref": "#/$defs/InsertMaskBeforePlaceholderConfig",
"default": {
"response_patterns": [
"Response : \uff5fnewline\uff60"
]
}
}
},
"$defs": {
"BARTNoiseConfig": {
"additionalProperties": false,
"properties": {
"permute_sent_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Permute this proportion of sentences (boundaries defined by ['.', '?', '!']) in all inputs.",
"title": "Permute Sent Ratio"
},
"rotate_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Rotate this proportion of inputs.",
"title": "Rotate Ratio"
},
"insert_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Insert this percentage of additional random tokens.",
"title": "Insert Ratio"
},
"random_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Instead of using <mask>, use random token this often.",
"title": "Random Ratio"
},
"mask_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Fraction of words/subwords that will be masked.",
"title": "Mask Ratio"
},
"mask_length": {
"anyOf": [
{
"enum": [
"subword",
"word",
"span-poisson"
],
"type": "string"
},
{
"type": "null"
}
],
"default": "subword",
"description": "Length of masking window to apply.",
"title": "Mask Length"
},
"poisson_lambda": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 3.0,
"description": "Lambda for Poisson distribution to sample span length if `-mask_length` set to span-poisson.",
"title": "Poisson Lambda"
},
"replace_length": {
"anyOf": [
{
"maximum": 1,
"minimum": -1,
"type": "integer"
},
{
"type": "null"
}
],
"default": -1,
"description": "When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)",
"title": "Replace Length"
}
},
"title": "BARTNoiseConfig",
"type": "object"
},
"BaseTokenizerConfig": {
"additionalProperties": false,
"properties": {
"src_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for src (or shared).",
"title": "Src Subword Model"
},
"tgt_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for tgt.",
"title": "Tgt Subword Model"
},
"src_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (source side)",
"title": "Src Subword Nbest"
},
"tgt_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (target side)",
"title": "Tgt Subword Nbest"
},
"src_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (source side)",
"title": "Src Subword Alpha"
},
"tgt_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (target side)",
"title": "Tgt Subword Alpha"
},
"src_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for src subword. Format: <word>\\t<count> per line.",
"title": "Src Subword Vocab"
},
"tgt_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for tgt subword. Format: <word>\\t<count> per line.",
"title": "Tgt Subword Vocab"
},
"src_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce src subword in src_subword_vocab with frequency >= src_vocab_threshold.",
"title": "Src Vocab Threshold"
},
"tgt_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce tgt subword in tgt_subword_vocab with frequency >= tgt_vocab_threshold.",
"title": "Tgt Vocab Threshold"
}
},
"title": "BaseTokenizerConfig",
"type": "object"
},
"CleanConfig": {
"additionalProperties": false,
"properties": {
"src_eq_tgt": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove ex src==tgt",
"title": "Src Eq Tgt"
},
"same_char": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove ex with same char more than 4 times",
"title": "Same Char"
},
"same_word": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove ex with same word more than 3 times",
"title": "Same Word"
},
"scripts_ok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"Latin",
"Common"
],
"description": "list of unicodata scripts accepted",
"title": "Scripts Ok"
},
"scripts_nok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"description": "list of unicodata scripts not accepted",
"title": "Scripts Nok"
},
"src_tgt_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 2.0,
"description": "ratio between src and tgt",
"title": "Src Tgt Ratio"
},
"avg_tok_min": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 3.0,
"description": "average length of tokens min",
"title": "Avg Tok Min"
},
"avg_tok_max": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 20.0,
"description": "average length of tokens max",
"title": "Avg Tok Max"
},
"langid": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"description": "list of languages accepted",
"title": "Langid"
}
},
"title": "CleanConfig",
"type": "object"
},
"DocifyConfig": {
"additionalProperties": false,
"properties": {
"doc_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 200,
"description": "Number of tokens per doc.",
"title": "Doc Length"
},
"max_context": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Max context segments.",
"title": "Max Context"
}
},
"title": "DocifyConfig",
"type": "object"
},
"FilterTooLongConfig": {
"additionalProperties": false,
"properties": {
"src_seq_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 192,
"description": "Maximum source sequence length.",
"title": "Src Seq Length"
},
"tgt_seq_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 192,
"description": "Maximum target sequence length.",
"title": "Tgt Seq Length"
}
},
"title": "FilterTooLongConfig",
"type": "object"
},
"HuggingfaceTokenizerConfig": {
"additionalProperties": false,
"properties": {
"path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Path"
},
"huggingface_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Huggingface Model"
},
"max_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"title": "Max Length"
}
},
"title": "HuggingfaceTokenizerConfig",
"type": "object"
},
"InlineTagsConfig": {
"additionalProperties": false,
"properties": {
"tags_dictionary_path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to a flat term dictionary.",
"title": "Tags Dictionary Path"
},
"tags_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.1,
"description": "Ratio of corpus to augment with tags.",
"title": "Tags Corpus Ratio"
},
"max_tags": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 12,
"description": "Maximum number of tags that can be added to a single sentence.",
"title": "Max Tags"
},
"paired_stag": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fph_#_beg\uff60",
"description": "The format of an opening paired inline tag. Must include the character #.",
"title": "Paired Stag"
},
"paired_etag": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fph_#_end\uff60",
"description": "The format of a closing paired inline tag. Must include the character #.",
"title": "Paired Etag"
},
"isolated_tag": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fph_#_std\uff60",
"description": "The format of an isolated inline tag. Must include the character #.",
"title": "Isolated Tag"
},
"src_delimiter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ffuzzy\uff60",
"description": "Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.",
"title": "Src Delimiter"
}
},
"title": "InlineTagsConfig",
"type": "object"
},
"InsertMaskBeforePlaceholderConfig": {
"additionalProperties": false,
"properties": {
"response_patterns": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"Response : \uff5fnewline\uff60"
],
"description": "Response pattern to locate the end of the prompt.",
"title": "Response Patterns"
}
},
"title": "InsertMaskBeforePlaceholderConfig",
"type": "object"
},
"NormalizeConfig": {
"additionalProperties": false,
"properties": {
"src_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Source language code",
"title": "Src Lang"
},
"tgt_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Target language code",
"title": "Tgt Lang"
},
"penn": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"description": "Penn substitution",
"title": "Penn"
},
"norm_quote_commas": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"description": "Normalize quotations and commas",
"title": "Norm Quote Commas"
},
"norm_numbers": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"description": "Normalize numbers",
"title": "Norm Numbers"
},
"pre_replace_unicode_punct": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Replace unicode punct",
"title": "Pre Replace Unicode Punct"
},
"post_remove_control_chars": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove control chars",
"title": "Post Remove Control Chars"
}
},
"title": "NormalizeConfig",
"type": "object"
},
"ONMTTokenizerConfig": {
"additionalProperties": false,
"properties": {
"src_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for src (or shared).",
"title": "Src Subword Model"
},
"tgt_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for tgt.",
"title": "Tgt Subword Model"
},
"src_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (source side)",
"title": "Src Subword Nbest"
},
"tgt_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (target side)",
"title": "Tgt Subword Nbest"
},
"src_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (source side)",
"title": "Src Subword Alpha"
},
"tgt_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (target side)",
"title": "Tgt Subword Alpha"
},
"src_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for src subword. Format: <word>\\t<count> per line.",
"title": "Src Subword Vocab"
},
"tgt_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for tgt subword. Format: <word>\\t<count> per line.",
"title": "Tgt Subword Vocab"
},
"src_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce src subword in src_subword_vocab with frequency >= src_vocab_threshold.",
"title": "Src Vocab Threshold"
},
"tgt_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce tgt subword in tgt_subword_vocab with frequency >= tgt_vocab_threshold.",
"title": "Tgt Vocab Threshold"
},
"src_subword_type": {
"anyOf": [
{
"enum": [
"none",
"sentencepiece",
"bpe"
],
"type": "string"
},
{
"type": "null"
}
],
"default": "none",
"description": "Type of subword model for src (or shared) in pyonmttok.",
"title": "Src Subword Type"
},
"tgt_subword_type": {
"anyOf": [
{
"enum": [
"none",
"sentencepiece",
"bpe"
],
"type": "string"
},
{
"type": "null"
}
],
"default": "none",
"description": "Type of subword model for tgt in pyonmttok.",
"title": "Tgt Subword Type"
},
"src_onmttok_kwargs": {
"anyOf": [
{
"type": "object"
},
{
"type": "null"
}
],
"default": {
"mode": "none"
},
"description": "Other pyonmttok options for src in dict string, except subword related options listed earlier.",
"title": "Src Onmttok Kwargs"
},
"tgt_onmttok_kwargs": {
"anyOf": [
{
"type": "object"
},
{
"type": "null"
}
],
"default": {
"mode": "none"
},
"description": "Other pyonmttok options for tgt in dict string, except subword related options listed earlier.",
"title": "Tgt Onmttok Kwargs"
},
"gpt2_pretok": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Preprocess sentence with byte-level mapping.",
"title": "Gpt2 Pretok"
},
"mapped_tokens": {
"anyOf": [
{
"items": {
"maxItems": 2,
"minItems": 2,
"prefixItems": [
{
"type": "string"
},
{
"type": "string"
}
],
"type": "array"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Mapped tokens for placeholders preservation",
"title": "Mapped Tokens"
}
},
"title": "ONMTTokenizerConfig",
"type": "object"
},
"PrefixConfig": {
"additionalProperties": false,
"properties": {
"src_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to prepend to all source examples.",
"title": "Src Prefix"
},
"tgt_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to prepend to all target examples.",
"title": "Tgt Prefix"
}
},
"title": "PrefixConfig",
"type": "object"
},
"SuffixConfig": {
"additionalProperties": false,
"properties": {
"src_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to append to all source examples.",
"title": "Src Suffix"
},
"tgt_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to append to all target examples.",
"title": "Tgt Suffix"
}
},
"title": "SuffixConfig",
"type": "object"
},
"SwitchOutConfig": {
"additionalProperties": false,
"properties": {
"switchout_temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Sampling temperature for SwitchOut. :math:`\\tau^{-1}` in :cite:`DBLP:journals/corr/abs-1808-07512`. Smaller value makes data more diverse.",
"title": "Switchout Temperature"
}
},
"title": "SwitchOutConfig",
"type": "object"
},
"TerminologyConfig": {
"additionalProperties": false,
"properties": {
"termbase_path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to a dictionary file with terms.",
"title": "Termbase Path"
},
"src_spacy_language_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Name of the spaCy language model for the source corpus.",
"title": "Src Spacy Language Model"
},
"tgt_spacy_language_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Name of the spaCy language model for the target corpus.",
"title": "Tgt Spacy Language Model"
},
"term_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.3,
"description": "Ratio of corpus to augment with terms.",
"title": "Term Corpus Ratio"
},
"term_example_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.2,
"description": "Maximum terms allowed in an example.",
"title": "Term Example Ratio"
},
"src_term_stoken": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fsrc_term_start\uff60",
"description": "The source term start token.",
"title": "Src Term Stoken"
},
"tgt_term_stoken": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ftgt_term_start\uff60",
"description": "The target term start token.",
"title": "Tgt Term Stoken"
},
"tgt_term_etoken": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ftgt_term_end\uff60",
"description": "The target term end token.",
"title": "Tgt Term Etoken"
},
"term_source_delimiter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ffuzzy\uff60",
"description": "Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.",
"title": "Term Source Delimiter"
}
},
"title": "TerminologyConfig",
"type": "object"
},
"TokenDropConfig": {
"additionalProperties": false,
"properties": {
"tokendrop_temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Sampling temperature for token deletion.",
"title": "Tokendrop Temperature"
}
},
"title": "TokenDropConfig",
"type": "object"
},
"TokenMaskConfig": {
"additionalProperties": false,
"properties": {
"tokenmask_temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Sampling temperature for token masking.",
"title": "Tokenmask Temperature"
}
},
"title": "TokenMaskConfig",
"type": "object"
},
"UpperCaseConfig": {
"additionalProperties": false,
"properties": {
"upper_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.01,
"description": "Corpus ratio to apply uppercasing.",
"title": "Upper Corpus Ratio"
}
},
"title": "UpperCaseConfig",
"type": "object"
}
},
"additionalProperties": false
}

field bart : BARTNoiseConfig = BARTNoiseConfig(permute_sent_ratio=0.0, rotate_ratio=0.0, insert_ratio=0.0, random_ratio=0.0, mask_ratio=0.0, mask_length='subword', poisson_lambda=3.0, replace_length=-1)

field bpe : BaseTokenizerConfig = BaseTokenizerConfig(src_subword_model=None, tgt_subword_model=None, src_subword_nbest=1, tgt_subword_nbest=1, src_subword_alpha=0.0, tgt_subword_alpha=0.0, src_subword_vocab='', tgt_subword_vocab='', src_vocab_threshold=0, tgt_vocab_threshold=0)

field clean : CleanConfig = CleanConfig(src_eq_tgt=False, same_char=False, same_word=False, scripts_ok=['Latin', 'Common'], scripts_nok=[], src_tgt_ratio=2.0, avg_tok_min=3.0, avg_tok_max=20.0, langid=[])

field docify : DocifyConfig = DocifyConfig(doc_length=200, max_context=1)

field filtertoolong : FilterTooLongConfig = FilterTooLongConfig(src_seq_length=192, tgt_seq_length=192)

field huggingface_tokenize : HuggingfaceTokenizerConfig = HuggingfaceTokenizerConfig(path=None, huggingface_model=None, max_length=None)

field inlinetags : InlineTagsConfig = InlineTagsConfig(tags_dictionary_path=None, tags_corpus_ratio=0.1, max_tags=12, paired_stag='｟ph_#_beg｠', paired_etag='｟ph_#_end｠', isolated_tag='｟ph_#_std｠', src_delimiter='｟fuzzy｠')

field insert_mask_before_placeholder : InsertMaskBeforePlaceholderConfig = InsertMaskBeforePlaceholderConfig(response_patterns=['Response : ｟newline｠'])

field normalize : NormalizeConfig = NormalizeConfig(src_lang='', tgt_lang='', penn=True, norm_quote_commas=True, norm_numbers=True, pre_replace_unicode_punct=False, post_remove_control_chars=False)

field onmt_tokenize : ONMTTokenizerConfig = ONMTTokenizerConfig(src_subword_model=None, tgt_subword_model=None, src_subword_nbest=1, tgt_subword_nbest=1, src_subword_alpha=0.0, tgt_subword_alpha=0.0, src_subword_vocab='', tgt_subword_vocab='', src_vocab_threshold=0, tgt_vocab_threshold=0, src_subword_type='none', tgt_subword_type='none', src_onmttok_kwargs={'mode': 'none'}, tgt_onmttok_kwargs={'mode': 'none'}, gpt2_pretok=False, mapped_tokens=None)

field prefix : PrefixConfig = PrefixConfig(src_prefix='', tgt_prefix='')

field sentencepiece : BaseTokenizerConfig = BaseTokenizerConfig(src_subword_model=None, tgt_subword_model=None, src_subword_nbest=1, tgt_subword_nbest=1, src_subword_alpha=0.0, tgt_subword_alpha=0.0, src_subword_vocab='', tgt_subword_vocab='', src_vocab_threshold=0, tgt_vocab_threshold=0)

field suffix : SuffixConfig = SuffixConfig(src_suffix='', tgt_suffix='')

field switchout : SwitchOutConfig = SwitchOutConfig(switchout_temperature=1.0)

field terminology : TerminologyConfig = TerminologyConfig(termbase_path=None, src_spacy_language_model=None, tgt_spacy_language_model=None, term_corpus_ratio=0.3, term_example_ratio=0.2, src_term_stoken='｟src_term_start｠', tgt_term_stoken='｟tgt_term_start｠', tgt_term_etoken='｟tgt_term_end｠', term_source_delimiter='｟fuzzy｠')

field tokendrop : TokenDropConfig = TokenDropConfig(tokendrop_temperature=1.0)

field tokenmask : TokenMaskConfig = TokenMaskConfig(tokenmask_temperature=1.0)

field uppercase : UpperCaseConfig = UpperCaseConfig(upper_corpus_ratio=0.01)