Skip to main content

Transforms

pydantic model eole.transforms.tokenize.ONMTTokenizerConfig[source]​

Bases: BaseTokenizerConfig

Show JSON schema
{
"title": "ONMTTokenizerConfig",
"type": "object",
"properties": {
"src_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for src (or shared).",
"title": "Src Subword Model"
},
"tgt_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for tgt.",
"title": "Tgt Subword Model"
},
"src_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (source side)",
"title": "Src Subword Nbest"
},
"tgt_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (target side)",
"title": "Tgt Subword Nbest"
},
"src_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (source side)",
"title": "Src Subword Alpha"
},
"tgt_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (target side)",
"title": "Tgt Subword Alpha"
},
"src_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for src subword. Format: <word>\\t<count> per line.",
"title": "Src Subword Vocab"
},
"tgt_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for tgt subword. Format: <word>\\t<count> per line.",
"title": "Tgt Subword Vocab"
},
"src_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce src subword in src_subword_vocab with frequency >= src_vocab_threshold.",
"title": "Src Vocab Threshold"
},
"tgt_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce tgt subword in tgt_subword_vocab with frequency >= tgt_vocab_threshold.",
"title": "Tgt Vocab Threshold"
},
"src_subword_type": {
"anyOf": [
{
"enum": [
"none",
"sentencepiece",
"bpe"
],
"type": "string"
},
{
"type": "null"
}
],
"default": "none",
"description": "Type of subword model for src (or shared) in pyonmttok.",
"title": "Src Subword Type"
},
"tgt_subword_type": {
"anyOf": [
{
"enum": [
"none",
"sentencepiece",
"bpe"
],
"type": "string"
},
{
"type": "null"
}
],
"default": "none",
"description": "Type of subword model for tgt in pyonmttok.",
"title": "Tgt Subword Type"
},
"src_onmttok_kwargs": {
"anyOf": [
{
"type": "object"
},
{
"type": "null"
}
],
"default": {
"mode": "none"
},
"description": "Other pyonmttok options for src in dict string, except subword related options listed earlier.",
"title": "Src Onmttok Kwargs"
},
"tgt_onmttok_kwargs": {
"anyOf": [
{
"type": "object"
},
{
"type": "null"
}
],
"default": {
"mode": "none"
},
"description": "Other pyonmttok options for tgt in dict string, except subword related options listed earlier.",
"title": "Tgt Onmttok Kwargs"
},
"gpt2_pretok": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Preprocess sentence with byte-level mapping.",
"title": "Gpt2 Pretok"
},
"mapped_tokens": {
"anyOf": [
{
"items": {
"maxItems": 2,
"minItems": 2,
"prefixItems": [
{
"type": "string"
},
{
"type": "string"
}
],
"type": "array"
},
"type": "array"
},
{
"type": "null"
}
],
"default": null,
"description": "Mapped tokens for placeholders preservation",
"title": "Mapped Tokens"
}
},
"additionalProperties": false
}

field gpt2_pretok : bool | None = False​

Preprocess sentence with byte-level mapping.

field mapped_tokens : List[Tuple[str, str]] | None = None​

Mapped tokens for placeholders preservation

field src_onmttok_kwargs : dict | None = {'mode': 'none'}​

Other pyonmttok options for src in dict string, except subword related options listed earlier.

field src_subword_type : Literal['none', 'sentencepiece', 'bpe'] | None = 'none'​

Type of subword model for src (or shared) in pyonmttok.

field tgt_onmttok_kwargs : dict | None = {'mode': 'none'}​

Other pyonmttok options for tgt in dict string, except subword related options listed earlier.

field tgt_subword_type : Literal['none', 'sentencepiece', 'bpe'] | None = 'none'​

Type of subword model for tgt in pyonmttok.

validator check_values Β» all fields[source]​

pydantic model eole.transforms.tokenize.BaseTokenizerConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "BaseTokenizerConfig",
"type": "object",
"properties": {
"src_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for src (or shared).",
"title": "Src Subword Model"
},
"tgt_subword_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path of subword model for tgt.",
"title": "Tgt Subword Model"
},
"src_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (source side)",
"title": "Src Subword Nbest"
},
"tgt_subword_nbest": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (target side)",
"title": "Tgt Subword Nbest"
},
"src_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (source side)",
"title": "Src Subword Alpha"
},
"tgt_subword_alpha": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0,
"description": "Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (target side)",
"title": "Tgt Subword Alpha"
},
"src_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for src subword. Format: <word>\\t<count> per line.",
"title": "Src Subword Vocab"
},
"tgt_subword_vocab": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Path to the vocabulary file for tgt subword. Format: <word>\\t<count> per line.",
"title": "Tgt Subword Vocab"
},
"src_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce src subword in src_subword_vocab with frequency >= src_vocab_threshold.",
"title": "Src Vocab Threshold"
},
"tgt_vocab_threshold": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 0,
"description": "Only produce tgt subword in tgt_subword_vocab with frequency >= tgt_vocab_threshold.",
"title": "Tgt Vocab Threshold"
}
},
"additionalProperties": false
}

field src_subword_alpha : float | None = 0​

Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (source side)

field src_subword_model : str | None = None​

Path of subword model for src (or shared).

field src_subword_nbest : int | None = 1​

Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (source side)

field src_subword_vocab : str | None = ''​

Path to the vocabulary file for src subword. Format: <word>\t<count> per line.

field src_vocab_threshold : int | None = 0​

Only produce src subword in src_subword_vocab with frequency >= src_vocab_threshold.

field tgt_subword_alpha : float | None = 0​

Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (target side)

field tgt_subword_model : str | None = None​

Path of subword model for tgt.

field tgt_subword_nbest : int | None = 1​

Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (target side)

field tgt_subword_vocab : str | None = ''​

Path to the vocabulary file for tgt subword. Format: <word>\t<count> per line.

field tgt_vocab_threshold : int | None = 0​

Only produce tgt subword in tgt_subword_vocab with frequency >= tgt_vocab_threshold.

validator check_values Β» all fields[source]​

pydantic model eole.transforms.docify.DocifyConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "DocifyConfig",
"type": "object",
"properties": {
"doc_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 200,
"description": "Number of tokens per doc.",
"title": "Doc Length"
},
"max_context": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 1,
"description": "Max context segments.",
"title": "Max Context"
}
},
"additionalProperties": false
}

field doc_length : int | None = 200​

Number of tokens per doc.

field max_context : int | None = 1​

Max context segments.

pydantic model eole.transforms.clean.CleanConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "CleanConfig",
"type": "object",
"properties": {
"src_eq_tgt": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove ex src==tgt",
"title": "Src Eq Tgt"
},
"same_char": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove ex with same char more than 4 times",
"title": "Same Char"
},
"same_word": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove ex with same word more than 3 times",
"title": "Same Word"
},
"scripts_ok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"Latin",
"Common"
],
"description": "list of unicodedata scripts accepted",
"title": "Scripts Ok"
},
"scripts_nok": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"description": "list of unicodedata scripts not accepted",
"title": "Scripts Nok"
},
"src_tgt_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 2.0,
"description": "ratio between src and tgt",
"title": "Src Tgt Ratio"
},
"avg_tok_min": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 3.0,
"description": "average length of tokens min",
"title": "Avg Tok Min"
},
"avg_tok_max": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 20.0,
"description": "average length of tokens max",
"title": "Avg Tok Max"
},
"langid": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"description": "list of languages accepted",
"title": "Langid"
}
},
"additionalProperties": false
}

field avg_tok_max : float | None = 20.0​

average length of tokens max

field avg_tok_min : float | None = 3.0​

average length of tokens min

field langid : List[str] | None = []​

list of languages accepted

field same_char : bool | None = False​

Remove ex with same char more than 4 times

field same_word : bool | None = False​

Remove ex with same word more than 3 times

field scripts_nok : List[str] | None = []​

list of unicodedata scripts not accepted

field scripts_ok : List[str] | None = ['Latin', 'Common']​

list of unicodedata scripts accepted

field src_eq_tgt : bool | None = False​

Remove ex src==tgt

field src_tgt_ratio : float | None = 2.0​

ratio between src and tgt

pydantic model eole.transforms.bart.BARTNoiseConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "BARTNoiseConfig",
"type": "object",
"properties": {
"permute_sent_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Permute this proportion of sentences (boundaries defined by ['.', '?', '!']) in all inputs.",
"title": "Permute Sent Ratio"
},
"rotate_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Rotate this proportion of inputs.",
"title": "Rotate Ratio"
},
"insert_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Insert this percentage of additional random tokens.",
"title": "Insert Ratio"
},
"random_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Instead of using <mask>, use random token this often.",
"title": "Random Ratio"
},
"mask_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.0,
"description": "Fraction of words/subwords that will be masked.",
"title": "Mask Ratio"
},
"mask_length": {
"anyOf": [
{
"enum": [
"subword",
"word",
"span-poisson"
],
"type": "string"
},
{
"type": "null"
}
],
"default": "subword",
"description": "Length of masking window to apply.",
"title": "Mask Length"
},
"poisson_lambda": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 3.0,
"description": "Lambda for Poisson distribution to sample span length if `-mask_length` set to span-poisson.",
"title": "Poisson Lambda"
},
"replace_length": {
"anyOf": [
{
"maximum": 1,
"minimum": -1,
"type": "integer"
},
{
"type": "null"
}
],
"default": -1,
"description": "When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)",
"title": "Replace Length"
}
},
"additionalProperties": false
}

field insert_ratio : float | None = 0.0​

Insert this percentage of additional random tokens.

field mask_length : Literal['subword', 'word', 'span-poisson'] | None = 'subword'​

Length of masking window to apply.

field mask_ratio : float | None = 0.0​

Fraction of words/subwords that will be masked.

field permute_sent_ratio : float | None = 0.0​

Permute this proportion of sentences (boundaries defined by [β€˜.’, β€˜?’, β€˜!’]) in all inputs.

field poisson_lambda : float | None = 3.0​

Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.

field random_ratio : float | None = 0.0​

Instead of using <mask>, use random token this often.

field replace_length : int | None = -1​

When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)

  • Constraints:
    • ge = -1
    • le = 1

field rotate_ratio : float | None = 0.0​

Rotate this proportion of inputs.

pydantic model eole.transforms.fuzzymatch.FuzzyMatchConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "FuzzyMatchConfig",
"type": "object",
"properties": {
"tm_path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to a flat text TM.",
"title": "Tm Path"
},
"fuzzy_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.1,
"description": "Ratio of corpus to augment with fuzzy matches.",
"title": "Fuzzy Corpus Ratio"
},
"fuzzy_threshold": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 70,
"description": "The fuzzy matching threshold.",
"title": "Fuzzy Threshold"
},
"tm_delimiter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\t",
"description": "The delimiter used in the flat text TM.",
"title": "Tm Delimiter"
},
"fuzzy_token": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ffuzzy\uff60",
"description": "The fuzzy token to be added with the matches.",
"title": "Fuzzy Token"
},
"fuzzymatch_min_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 70,
"description": "Max length for TM entries and examples to match.",
"title": "Fuzzymatch Min Length"
}
},
"additionalProperties": false
}

field fuzzy_corpus_ratio : float | None = 0.1​

Ratio of corpus to augment with fuzzy matches.

field fuzzy_threshold : float | None = 70​

The fuzzy matching threshold.

field fuzzy_token : str | None = '⦅fuzzyο½ '​

The fuzzy token to be added with the matches.

field fuzzymatch_min_length : int | None = 70​

Max length for TM entries and examples to match.

field tm_delimiter : str | None = '\t'​

The delimiter used in the flat text TM.

field tm_path : str | None = None​

Path to a flat text TM.

pydantic model eole.transforms.inlinetags.InlineTagsConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "InlineTagsConfig",
"type": "object",
"properties": {
"tags_dictionary_path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to a flat term dictionary.",
"title": "Tags Dictionary Path"
},
"tags_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.1,
"description": "Ratio of corpus to augment with tags.",
"title": "Tags Corpus Ratio"
},
"max_tags": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 12,
"description": "Maximum number of tags that can be added to a single sentence.",
"title": "Max Tags"
},
"paired_stag": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fph_#_beg\uff60",
"description": "The format of an opening paired inline tag. Must include the character #.",
"title": "Paired Stag"
},
"paired_etag": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fph_#_end\uff60",
"description": "The format of a closing paired inline tag. Must include the character #.",
"title": "Paired Etag"
},
"isolated_tag": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fph_#_std\uff60",
"description": "The format of an isolated inline tag. Must include the character #.",
"title": "Isolated Tag"
},
"src_delimiter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ffuzzy\uff60",
"description": "Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.",
"title": "Src Delimiter"
}
},
"additionalProperties": false
}

field isolated_tag : str | None = '⦅ph_#_stdο½ '​

The format of an isolated inline tag. Must include the character #.

field max_tags : int | None = 12​

Maximum number of tags that can be added to a single sentence.

field paired_etag : str | None = '⦅ph_#_endο½ '​

The format of a closing paired inline tag. Must include the character #.

field paired_stag : str | None = '⦅ph_#_begο½ '​

The format of an opening paired inline tag. Must include the character #.

field src_delimiter : str | None = '⦅fuzzyο½ '​

Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.

field tags_corpus_ratio : float | None = 0.1​

Ratio of corpus to augment with tags.

field tags_dictionary_path : str | None = None​

Path to a flat term dictionary.

pydantic model eole.transforms.uppercase.UpperCaseConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "UpperCaseConfig",
"type": "object",
"properties": {
"upper_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.01,
"description": "Corpus ratio to apply uppercasing.",
"title": "Upper Corpus Ratio"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field upper_corpus_ratio : float | None = 0.01​

Corpus ratio to apply uppercasing.

pydantic model eole.transforms.sampling.TokenDropConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "TokenDropConfig",
"type": "object",
"properties": {
"tokendrop_temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Sampling temperature for token deletion.",
"title": "Tokendrop Temperature"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field tokendrop_temperature : float | None = 1.0​

Sampling temperature for token deletion.

pydantic model eole.transforms.sampling.TokenMaskConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "TokenMaskConfig",
"type": "object",
"properties": {
"tokenmask_temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Sampling temperature for token masking.",
"title": "Tokenmask Temperature"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field tokenmask_temperature : float | None = 1.0​

Sampling temperature for token masking.

pydantic model eole.transforms.sampling.SwitchOutConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "SwitchOutConfig",
"type": "object",
"properties": {
"switchout_temperature": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 1.0,
"description": "Sampling temperature for SwitchOut. :math:`\\tau^{-1}` in :cite:`DBLP:journals/corr/abs-1808-07512`. Smaller value makes data more diverse.",
"title": "Switchout Temperature"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field switchout_temperature : float | None = 1.0​

Sampling temperature for SwitchOut. $\tau^{-1}$ in [Wang et al. 2018, arXiv:1808.07512]. Smaller value makes data more diverse.

pydantic model eole.transforms.terminology.TerminologyConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "TerminologyConfig",
"type": "object",
"properties": {
"termbase_path": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Path to a dictionary file with terms.",
"title": "Termbase Path"
},
"src_spacy_language_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Name of the spaCy language model for the source corpus.",
"title": "Src Spacy Language Model"
},
"tgt_spacy_language_model": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"description": "Name of the spaCy language model for the target corpus.",
"title": "Tgt Spacy Language Model"
},
"term_corpus_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.3,
"description": "Ratio of corpus to augment with terms.",
"title": "Term Corpus Ratio"
},
"term_example_ratio": {
"anyOf": [
{
"type": "number"
},
{
"type": "null"
}
],
"default": 0.2,
"description": "Maximum terms allowed in an example.",
"title": "Term Example Ratio"
},
"src_term_stoken": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5fsrc_term_start\uff60",
"description": "The source term start token.",
"title": "Src Term Stoken"
},
"tgt_term_stoken": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ftgt_term_start\uff60",
"description": "The target term start token.",
"title": "Tgt Term Stoken"
},
"tgt_term_etoken": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ftgt_term_end\uff60",
"description": "The target term end token.",
"title": "Tgt Term Etoken"
},
"term_source_delimiter": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "\uff5ffuzzy\uff60",
"description": "Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.",
"title": "Term Source Delimiter"
}
},
"additionalProperties": false
}

field src_spacy_language_model : str | None = None​

Name of the spaCy language model for the source corpus.

field src_term_stoken : str | None = '⦅src_term_startο½ '​

The source term start token.

field term_corpus_ratio : float | None = 0.3​

Ratio of corpus to augment with terms.

field term_example_ratio : float | None = 0.2​

Maximum terms allowed in an example.

field term_source_delimiter : str | None = '⦅fuzzyο½ '​

Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.

field termbase_path : str | None = None​

Path to a dictionary file with terms.

field tgt_spacy_language_model : str | None = None​

Name of the spaCy language model for the target corpus.

field tgt_term_etoken : str | None = '⦅tgt_term_endο½ '​

The target term end token.

field tgt_term_stoken : str | None = '⦅tgt_term_startο½ '​

The target term start token.

pydantic model eole.transforms.misc.FilterTooLongConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "FilterTooLongConfig",
"type": "object",
"properties": {
"src_seq_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 192,
"description": "Maximum source sequence length.",
"title": "Src Seq Length"
},
"tgt_seq_length": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": 192,
"description": "Maximum target sequence length.",
"title": "Tgt Seq Length"
}
},
"additionalProperties": false
}

field src_seq_length : int | None = 192​

Maximum source sequence length.

field tgt_seq_length : int | None = 192​

Maximum target sequence length.

pydantic model eole.transforms.misc.PrefixConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "PrefixConfig",
"type": "object",
"properties": {
"src_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to prepend to all source examples.",
"title": "Src Prefix"
},
"tgt_prefix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to prepend to all target examples.",
"title": "Tgt Prefix"
}
},
"additionalProperties": false
}

field src_prefix : str | None = ''​

String to prepend to all source examples.

field tgt_prefix : str | None = ''​

String to prepend to all target examples.

pydantic model eole.transforms.misc.SuffixConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "SuffixConfig",
"type": "object",
"properties": {
"src_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to append to all source examples.",
"title": "Src Suffix"
},
"tgt_suffix": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "String to append to all target examples.",
"title": "Tgt Suffix"
}
},
"additionalProperties": false
}

field src_suffix : str | None = ''​

String to append to all source examples.

field tgt_suffix : str | None = ''​

String to append to all target examples.

pydantic model eole.transforms.normalize.NormalizeConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "NormalizeConfig",
"type": "object",
"properties": {
"src_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Source language code",
"title": "Src Lang"
},
"tgt_lang": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "",
"description": "Target language code",
"title": "Tgt Lang"
},
"penn": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"description": "Penn substitution",
"title": "Penn"
},
"norm_quote_commas": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"description": "Normalize quotations and commas",
"title": "Norm Quote Commas"
},
"norm_numbers": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": true,
"description": "Normalize numbers",
"title": "Norm Numbers"
},
"pre_replace_unicode_punct": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Replace unicode punct",
"title": "Pre Replace Unicode Punct"
},
"post_remove_control_chars": {
"anyOf": [
{
"type": "boolean"
},
{
"type": "null"
}
],
"default": false,
"description": "Remove control chars",
"title": "Post Remove Control Chars"
}
},
"additionalProperties": false
}

field norm_numbers : bool | None = True​

Normalize numbers

field norm_quote_commas : bool | None = True​

Normalize quotations and commas

field penn : bool | None = True​

Penn substitution

field post_remove_control_chars : bool | None = False​

Remove control chars

field pre_replace_unicode_punct : bool | None = False​

Replace unicode punct

field src_lang : str | None = ''​

Source language code

field tgt_lang : str | None = ''​

Target language code

pydantic model eole.transforms.insert_mask_before_placeholder.InsertMaskBeforePlaceholderConfig[source]​

Bases: TransformConfig

Show JSON schema
{
"title": "InsertMaskBeforePlaceholderConfig",
"type": "object",
"properties": {
"response_patterns": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [
"Response : \uff5fnewline\uff60"
],
"description": "Response pattern to locate the end of the prompt.",
"title": "Response Patterns"
}
},
"additionalProperties": false
}

  • Config:
    • validate_assignment: bool = True
    • validate_default: bool = True
    • use_enum_values: bool = True
    • extra: str = forbid
    • protected_namespaces: tuple = ()
  • Fields:

field response_patterns : List[str] | None = ['Response : ⦅newlineο½ ']​

Response pattern to locate the end of the prompt.