Skip to main content

Inference

pydantic model eole.config.inference.DecodingConfig[source]

Bases: Config

Show JSON schema
{
"title": "DecodingConfig",
"type": "object",
"properties": {
"beam_size": {
"default": 5,
"description": "Beam size.",
"title": "Beam Size",
"type": "integer"
},
"ratio": {
"default": -0.0,
"description": "Ratio based beam stop condition.",
"title": "Ratio",
"type": "number"
},
"top_k": {
"default": 0,
"description": "Set this to -1 to do random sampling from full distribution. Set this to value k>1 to do random sampling restricted to the k most likely next tokens. Set this to 1 to use argmax.",
"title": "Top K",
"type": "integer"
},
"top_p": {
"default": 0.0,
"description": "Probability for top-p/nucleus sampling. Restrict tokens to the most likely until the cumulated probability is over p. In range [0,1]. (https://arxiv.org/abs/1904.09751)",
"lte": 1.0,
"minimum": 0.0,
"title": "Top P",
"type": "number"
},
"temperature": {
"default": 1.0,
"description": "If doing random sampling, divide the logits by this before computing softmax during decoding.",
"title": "Temperature",
"type": "number"
},
"length_penalty": {
"default": "avg",
"description": "Length penalty to use.",
"enum": [
"avg",
"wu",
"none"
],
"title": "Length Penalty",
"type": "string"
},
"alpha": {
"default": 1.0,
"description": "Length penalty parameter (higher = longer generation)",
"title": "Alpha",
"type": "number"
},
"coverage_penalty": {
"default": "none",
"description": "Coverage penalty to use. Only available in beam search.",
"enum": [
"none",
"wu",
"summary"
],
"title": "Coverage Penalty",
"type": "string"
},
"beta": {
"default": -0.0,
"description": "Coverage penalty parameter.",
"title": "Beta",
"type": "number"
},
"stepwise_penalty": {
"default": false,
"description": "Apply coverage penalty at every decoding step. Helpful for summary penalty.",
"title": "Stepwise Penalty",
"type": "boolean"
},
"min_length": {
"default": 0,
"description": "Minimum prediction length.",
"minimum": 0,
"title": "Min Length",
"type": "integer"
},
"max_length": {
"default": 250,
"description": "Maximum prediction length.",
"title": "Max Length",
"type": "integer"
},
"max_length_ratio": {
"default": 2,
"description": "Maximum prediction length ratio. For European languages, 2 is large enough, for target Asian charageters, need to increase to 2-3, for special languages (Burmese, Amharic) to 10.",
"minimum": 1.0,
"title": "Max Length Ratio",
"type": "number"
},
"block_ngram_repeat": {
"default": 0,
"description": "Block repetition of ngrams during decoding.",
"title": "Block Ngram Repeat",
"type": "integer"
},
"ignore_when_blocking": {
"default": [],
"description": "Ignore these strings when blocking repeats. You want to block sentence delimiters.",
"items": {
"type": "string"
},
"title": "Ignore When Blocking",
"type": "array"
},
"replace_unk": {
"default": false,
"description": "Replace the generated UNK tokens with the source token that had the highest attention weight. If phrase_table is provided, it will lok up the identified source token and give the corresponding target token. If it is not provided (or the identified source token does not exist in the table), then it will copy the source token.",
"title": "Replace Unk",
"type": "boolean"
},
"ban_unk_token": {
"default": false,
"description": "Prevent unk token generation by setting unk probability to 0.",
"title": "Ban Unk Token",
"type": "boolean"
},
"phrase_table": {
"default": "",
"description": "If phrase_table is provided (with replace_unk), it will look up the identified source token and give the corresponding target token.",
"title": "Phrase Table",
"type": "string"
},
"n_best": {
"default": 1,
"description": "Output the n_best decoded sentences.",
"title": "N Best",
"type": "integer"
},
"dump_beam": {
"default": "",
"description": "File to dump beam information to.",
"title": "Dump Beam",
"type": "string"
},
"verbose": {
"default": false,
"description": "Print scores and predictions for each input.",
"title": "Verbose",
"type": "boolean"
},
"with_score": {
"default": false,
"description": "Add a tab separated score to each output.",
"title": "With Score",
"type": "boolean"
},
"attn_debug": {
"default": false,
"description": "Print best attn for each word.",
"title": "Attn Debug",
"type": "boolean"
},
"align_debug": {
"default": false,
"description": "Print best align for each word.",
"title": "Align Debug",
"type": "boolean"
}
},
"additionalProperties": false
}

field align_debug : bool = False

Print best align for each word.

field alpha : float = 1.0

Length penalty parameter (higher = longer generation)

field attn_debug : bool = False

Print best attn for each word.

field ban_unk_token : bool = False

Prevent unk token generation by setting unk probability to 0.

field beam_size : int = 5

Beam size.

field beta : float = -0.0

Coverage penalty parameter.

field block_ngram_repeat : int = 0

Block repetition of ngrams during decoding.

field coverage_penalty : Literal['none', 'wu', 'summary'] = 'none'

Coverage penalty to use. Only available in beam search.

field dump_beam : str = ''

File to dump beam information to.

field ignore_when_blocking : List[str] = []

Ignore these strings when blocking repeats. You want to block sentence delimiters.

field length_penalty : Literal['avg', 'wu', 'none'] = 'avg'

Length penalty to use.

field max_length : int = 250

Maximum prediction length.

field max_length_ratio : float = 2

Maximum prediction length ratio. For European languages, 2 is large enough, for target Asian characters, need to increase to 2-3, for special languages (Burmese, Amharic) to 10.

  • Constraints:
    • ge = 1

field min_length : int = 0

Minimum prediction length.

  • Constraints:
    • ge = 0

field n_best : int = 1

Output the n_best decoded sentences.

field phrase_table : str = ''

If phrase_table is provided (with replace_unk), it will look up the identified source token and give the corresponding target token.

field ratio : float = -0.0

Ratio based beam stop condition.

field replace_unk : bool = False

Replace the generated UNK tokens with the source token that had the highest attention weight. If phrase_table is provided, it will look up the identified source token and give the corresponding target token. If it is not provided (or the identified source token does not exist in the table), then it will copy the source token.

field stepwise_penalty : bool = False

Apply coverage penalty at every decoding step. Helpful for summary penalty.

field temperature : float = 1.0

If doing random sampling, divide the logits by this before computing softmax during decoding.

field top_k : int = 0

Set this to -1 to do random sampling from full distribution. Set this to value k>1 to do random sampling restricted to the k most likely next tokens. Set this to 1 to use argmax.

field top_p : float = 0.0

Probability for top-p/nucleus sampling. Restrict tokens to the most likely until the cumulated probability is over p. In range [0,1]. (https://arxiv.org/abs/1904.09751)

  • Constraints:
    • ge = 0.0

field verbose : bool = False

Print scores and predictions for each input.

field with_score : bool = False

Add a tab separated score to each output.

pydantic model eole.config.inference.InferenceConfig[source]

Bases: RunningConfig, DecodingConfig, LoRaConfig, QuantizeConfig

Show JSON schema
{
"title": "InferenceConfig",
"type": "object",
"properties": {
"quant_layers": {
"default": [],
"description": "List of layers to be compressed in 4/8bit.",
"items": {
"type": "string"
},
"title": "Quant Layers",
"type": "array"
},
"quant_type": {
"default": "",
"description": "Type of compression.",
"enum": [
"",
"bnb_8bit",
"bnb_FP4",
"bnb_NF4",
"awq_gemm",
"awq_gemv"
],
"title": "Quant Type",
"type": "string"
},
"w_bit": {
"default": 4,
"description": "W_bit quantization",
"title": "W Bit",
"type": "integer"
},
"group_size": {
"default": 128,
"description": "Group size quantization.",
"title": "Group Size",
"type": "integer"
},
"lora_layers": {
"default": [],
"description": "List of layers to be replaced by LoRa layers. E.g. ['linear_values', 'linear_query'] (\u00a74.2 in https://arxiv.org/abs/2106.09685)",
"items": {
"type": "string"
},
"title": "Lora Layers",
"type": "array"
},
"lora_embedding": {
"default": false,
"description": "Replace embeddings with LoRa Embeddings (\u00a75.1)",
"title": "Lora Embedding",
"type": "boolean"
},
"lora_rank": {
"default": 2,
"description": "r=2 successfully tested with NLLB-200 3.3B",
"title": "Lora Rank",
"type": "integer"
},
"lora_alpha": {
"default": 1,
"description": "\u00a74.1 https://arxiv.org/abs/2106.09685",
"title": "Lora Alpha",
"type": "integer"
},
"lora_dropout": {
"default": 0.0,
"description": "Rule of thumb: same value as in main model.",
"title": "Lora Dropout",
"type": "number"
},
"beam_size": {
"default": 5,
"description": "Beam size.",
"title": "Beam Size",
"type": "integer"
},
"ratio": {
"default": -0.0,
"description": "Ratio based beam stop condition.",
"title": "Ratio",
"type": "number"
},
"top_k": {
"default": 0,
"description": "Set this to -1 to do random sampling from full distribution. Set this to value k>1 to do random sampling restricted to the k most likely next tokens. Set this to 1 to use argmax.",
"title": "Top K",
"type": "integer"
},
"top_p": {
"default": 0.0,
"description": "Probability for top-p/nucleus sampling. Restrict tokens to the most likely until the cumulated probability is over p. In range [0,1]. (https://arxiv.org/abs/1904.09751)",
"lte": 1.0,
"minimum": 0.0,
"title": "Top P",
"type": "number"
},
"temperature": {
"default": 1.0,
"description": "If doing random sampling, divide the logits by this before computing softmax during decoding.",
"title": "Temperature",
"type": "number"
},
"length_penalty": {
"default": "avg",
"description": "Length penalty to use.",
"enum": [
"avg",
"wu",
"none"
],
"title": "Length Penalty",
"type": "string"
},
"alpha": {
"default": 1.0,
"description": "Length penalty parameter (higher = longer generation)",
"title": "Alpha",
"type": "number"
},
"coverage_penalty": {
"default": "none",
"description": "Coverage penalty to use. Only available in beam search.",
"enum": [
"none",
"wu",
"summary"
],
"title": "Coverage Penalty",
"type": "string"
},
"beta": {
"default": -0.0,
"description": "Coverage penalty parameter.",
"title": "Beta",
"type": "number"
},
"stepwise_penalty": {
"default": false,
"description": "Apply coverage penalty at every decoding step. Helpful for summary penalty.",
"title": "Stepwise Penalty",
"type": "boolean"
},
"min_length": {
"default": 0,
"description": "Minimum prediction length.",
"minimum": 0,
"title": "Min Length",
"type": "integer"
},
"max_length": {
"default": 250,
"description": "Maximum prediction length.",
"title": "Max Length",
"type": "integer"
},
"max_length_ratio": {
"default": 2,
"description": "Maximum prediction length ratio. For European languages, 2 is large enough, for target Asian charageters, need to increase to 2-3, for special languages (Burmese, Amharic) to 10.",
"minimum": 1.0,
"title": "Max Length Ratio",
"type": "number"
},
"block_ngram_repeat": {
"default": 0,
"description": "Block repetition of ngrams during decoding.",
"title": "Block Ngram Repeat",
"type": "integer"
},
"ignore_when_blocking": {
"default": [],
"description": "Ignore these strings when blocking repeats. You want to block sentence delimiters.",
"items": {
"type": "string"
},
"title": "Ignore When Blocking",
"type": "array"
},
"replace_unk": {
"default": false,
"description": "Replace the generated UNK tokens with the source token that had the highest attention weight. If phrase_table is provided, it will lok up the identified source token and give the corresponding target token. If it is not provided (or the identified source token does not exist in the table), then it will copy the source token.",
"title": "Replace Unk",
"type": "boolean"
},
"ban_unk_token": {
"default": false,
"description": "Prevent unk token generation by setting unk probability to 0.",
"title": "Ban Unk Token",
"type": "boolean"
},
"phrase_table": {
"default": "",
"description": "If phrase_table is provided (with replace_unk), it will look up the identified source token and give the corresponding target token.",
"title": "Phrase Table",
"type": "string"
},
"n_best": {
"default": 1,
"description": "Output the n_best decoded sentences.",
"title": "N Best",
"type": "integer"
},
"dump_beam": {
"default": "",
"description": "File to dump beam information to.",
"title": "Dump Beam",
"type": "string"
},
"verbose": {
"default": false,
"description": "Print scores and predictions for each input.",
"title": "Verbose",
"type": "boolean"
},
"with_score": {
"default": false,
"description": "Add a tab separated score to each output.",
"title": "With Score",
"type": "boolean"
},
"attn_debug": {
"default": false,
"description": "Print best attn for each word.",
"title": "Attn Debug",
"type": "boolean"
},
"align_debug": {
"default": false,
"description": "Print best align for each word.",
"title": "Align Debug",
"type": "boolean"
},
"gpu_ranks": {
"default": [],
"description": "List of ranks for each process.",
"items": {
"type": "integer"
},
"title": "Gpu Ranks",
"type": "array"
},
"world_size": {
"default": 1,
"description": "Total number of distributed processes.",
"title": "World Size",
"type": "integer"
},
"parallel_mode": {
"default": "data_parallel",
"description": "Distributed mode.",
"enum": [
"data_parallel",
"tensor_parallel"
],
"title": "Parallel Mode",
"type": "string"
},
"gpu_backend": {
"default": "nccl",
"description": "Type of torch distributed backend.",
"title": "Gpu Backend",
"type": "string"
},
"gpu_verbose_level": {
"default": 0,
"description": "Gives more info on each process per GPU.",
"title": "Gpu Verbose Level",
"type": "integer"
},
"master_ip": {
"default": "localhost",
"description": "IP of master for torch.distributed training.",
"title": "Master Ip",
"type": "string"
},
"master_port": {
"default": 10000,
"description": "Port of master for torch.distributed training.",
"title": "Master Port",
"type": "integer"
},
"timeout": {
"default": 60,
"description": "Timeout for one GPU to wait for the others.",
"title": "Timeout",
"type": "integer"
},
"model_path": {
"default": "model",
"description": "Path to directory containing all model components.",
"title": "Model Path",
"type": "string"
},
"self_attn_backend": {
"default": "flash",
"description": "Self-attention backend.",
"enum": [
"flash",
"pytorch"
],
"title": "Self Attn Backend",
"type": "string"
},
"compute_dtype": {
"description": "Compute dtype (precision) to use for main compute. Some parameters might have other dtypes for specific cases (e.g. torch.amp -- See eole.config.training.TrainingConfig.storage_dtype) fp32 to force slow fp16 model on gtx1080, int8 to enable pytorch native 8-bit quantization (cpu only).",
"enum": [
"fp32",
"fp16",
"int8",
"bf16"
],
"title": "Compute Dtype",
"type": "string"
},
"torch_compile": {
"default": false,
"description": "Use torch.compile with dynamic=True.",
"title": "Torch Compile",
"type": "boolean"
},
"report_align": {
"default": false,
"description": "Report alignment for each translation.",
"title": "Report Align",
"type": "boolean"
},
"gold_align": {
"default": false,
"description": "Report alignment between source and gold target. Useful to test the performance of learnt alignments.",
"title": "Gold Align",
"type": "boolean"
},
"report_time": {
"default": false,
"description": "Report some translation time metrics.",
"title": "Report Time",
"type": "boolean"
},
"profile": {
"default": false,
"description": "Report pytorch profiling stats.",
"title": "Profile",
"type": "boolean"
},
"batch_size": {
"default": 30,
"description": "Batch size.",
"title": "Batch Size",
"type": "integer"
},
"batch_type": {
"default": "sents",
"description": "Batch grouping for batch size.",
"enum": [
"sents",
"tokens"
],
"title": "Batch Type",
"type": "string"
},
"avg_raw_probs": {
"default": false,
"description": "If set, during ensembling scores from different models will be combined by averaging their raw probabilities and then taking the log. Otherwise, the log probabilities will be averaged directly. Necessary for models whose output layers can assign zero probability.",
"title": "Avg Raw Probs",
"type": "boolean"
},
"data_type": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": "text",
"title": "Data Type"
},
"chat_template": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"default": null,
"title": "Chat Template"
},
"optional_eos": {
"anyOf": [
{
"items": {
"type": "string"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"description": "Optional EOS tokens that would stop generation, e.g. <|eot_id|> for Llama3",
"title": "Optional Eos"
}
},
"additionalProperties": false
}

field avg_raw_probs : bool = False

If set, during ensembling scores from different models will be combined by averaging their raw probabilities and then taking the log. Otherwise, the log probabilities will be averaged directly. Necessary for models whose output layers can assign zero probability.

  • Validated by:
    • _validate_running_config

field batch_size : int = 30

Batch size.

  • Validated by:
    • _validate_running_config

field batch_type : Literal['sents', 'tokens'] = 'sents'

Batch grouping for batch size.

  • Validated by:
    • _validate_running_config

field chat_template : str | None = None

  • Validated by:
    • _validate_running_config

field data_type : str | None = 'text'

  • Validated by:
    • _validate_running_config

field gold_align : bool = False

Report alignment between source and gold target. Useful to test the performance of learnt alignments.

  • Validated by:
    • _validate_running_config

field optional_eos : List[str] | None = []

Optional EOS tokens that would stop generation, e.g. <|eot_id|> for Llama3

  • Validated by:
    • _validate_running_config

field profile : bool = False

Report pytorch profiling stats.

  • Validated by:
    • _validate_running_config

field report_align : bool = False

Report alignment for each translation.

  • Validated by:
    • _validate_running_config

field report_time : bool = False

Report some translation time metrics.

  • Validated by:
    • _validate_running_config

get_model_path()[source]

property storage_dtype : dtype[source]

Deduce which dtype to use for main model parameters.