97 changes: 10 additions & 87 deletions docs/sphinx_doc/source/tutorial/trinity_configs.md
@@ -187,39 +187,30 @@ Support `nccl` and `checkpoint`, `nccl` represents that model weights in `explor
```yaml
trainer:
trainer_type: 'verl'
trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
save_interval: 100
```

- `trainer.trainer_type`: The backend of the trainer. Only `verl` is supported.
- `trainer.trainer_config_path`: The path to the trainer configuration file. It must be set manually.
- `trainer.save_interval`: The number of steps between two checkpoints. Default is `100`.

- `trainer.actor_grad_clip`: Gradient clipping threshold for actor model training.
- `trainer.actor_clip_ratio`: Used to compute the policy loss.
- `trainer.actor_entropy_coeff`: Used to compute the policy loss.
- `trainer.actor_use_kl_loss`: Whether to enable the KL loss.
- `trainer.actor_kl_loss_coef`: The coefficient of the KL loss.

- `trainer.trainer_config`: The configuration of the trainer. Only one of `trainer.trainer_config` and `trainer.trainer_config_path` needs to be set (a combined example follows this list).
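For example, a minimal `trainer` section that enables the KL loss might look like the following sketch (the values are illustrative, not recommendations):

```yaml
trainer:
  trainer_type: 'verl'                # only 'verl' is supported
  trainer_config_path: 'examples/ppo_countdown/train_countdown.yaml'
  save_interval: 100                  # checkpoint every 100 steps
  actor_grad_clip: 1.0                # gradient clipping threshold for the actor
  actor_use_kl_loss: true             # enable the KL loss
  actor_kl_loss_coef: 0.001           # coefficient of the KL loss
```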

### veRL Trainer Configuration

Here we mainly introduce the parameters that can be set in veRL. For the specific meaning of each parameter, please refer to the official [veRL documentation](https://github.com/volcengine/verl/blob/0bdf7f469854815177e73dcfe9e420836c952e6e/docs/examples/config.rst).

```yaml
data:
tokenizer: null
train_files: train_example.parquet
val_files: test_example.parquet
prompt_key: prompt
max_prompt_length: 256
max_response_length: 1024
train_batch_size: 256
val_batch_size: null
return_raw_input_ids: False # This should be set to True when the tokenizers of the policy and the reward model differ
return_raw_chat: False
shuffle: True
filter_overlong_prompts: False # for large-scale datasets, filtering overlong prompts could be time-consuming. You should disable this and set `truncation='left'`
truncation: error
image_key: images

actor_rollout_ref:
hybrid_engine: True
model:
path: /PATH/TO/MODEL/CHECKPOINT/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
@@ -270,35 +261,6 @@ actor_rollout_ref:
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
use_fire_sampling: False # https://arxiv.org/abs/2410.21236
prompt_length: ${data.max_prompt_length} # not used for open-source
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.4
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_model_len: null
max_num_seqs: 1024
# log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 4
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # can yield higher throughput
# for hf rollout
do_sample: True
# number of responses (i.e. num sample times)
n: 1 # > 1 for grpo

critic:
strategy: fsdp
@@ -309,8 +271,6 @@ critic:
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be overridden by the program
model:
path: /PATH/TO/MODEL/CHECKPOINT/
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
@@ -323,7 +283,6 @@ critic:
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
# ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 8
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
@@ -335,34 +294,13 @@ critic:
grad_clip: 1.0
cliprange_value: 0.5

reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
# micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
# micro_batch_size_per_gpu: 2 # set a number
# max_length: null
ulysses_sequence_parallel_size: 1 # sp size
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
reward_manager: tinyzero

custom_reward_function:
path: null
name: compute_score

algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: gae
norm_adv_by_std_in_grpo: True
use_kl_in_reward: False
kl_penalty: kl # how to estimate kl divergence
@@ -374,24 +312,14 @@ algorithm:

trainer:
balance_batch: True
total_epochs: 15
# total_training_steps: null
project_name: TinyZero
experiment_name: trinity-qwen2.5-1.5b
logger: [ 'wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 2
save_freq: 100
# auto: find the last checkpoint to resume from; if none is found, start from scratch
resume_mode: auto # or disable, or resume_path (used together with resume_from_path)
resume_from_path: ""
test_freq: 100
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False
max_actor_ckpt_to_keep: 5
max_critic_ckpt_to_keep: 5
@@ -402,11 +330,6 @@ trainer:
- `actor_rollout_ref.model.use_remove_padding`: Whether to remove pad tokens, which reduces training time.
- `actor_rollout_ref.actor.use_dynamic_bsz`: Whether to reorganize the batch data, splicing shorter samples together to reduce the effective batch size during training.
- `actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`: Batch size for one GPU in one forward pass.
- `actor_rollout_ref.actor.grad_clip`: Gradient clipping threshold for actor model training.
- `actor_rollout_ref.actor.clip_ratio`: Used to compute the policy loss.
- `actor_rollout_ref.actor.entropy_coeff`: Used to compute the policy loss.
- `actor_rollout_ref.actor.use_kl_loss`: Whether to enable the KL loss.
- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of the KL loss.
- `actor_rollout_ref.actor.kl_loss_type`: How to compute the KL loss; valid values are `kl`, `abs`, `mse`, or `low_var_kl` (see the sketch after this list).
- `actor_rollout_ref.actor.ulysses_sequence_parallel_size`: Ulysses sequence parallel size.
- `actor_rollout_ref.actor.tau`: Strength of regularization w.r.t. the old/reference policy.
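As a rough sketch of how these `kl_loss_type` options typically estimate the per-token KL divergence (based on common implementations of these estimators, not necessarily veRL's exact code), write the log-ratio as `r = log pi_theta(a|s) - log pi_ref(a|s)`; then:

```latex
% Illustrative per-token KL estimates as a function of the log-ratio r:
\mathrm{kl}:\; r \qquad
\mathrm{abs}:\; |r| \qquad
\mathrm{mse}:\; \tfrac{1}{2}\,r^{2} \qquad
\mathrm{low\_var\_kl}:\; e^{-r} - 1 + r
```

Here `low_var_kl` corresponds to the low-variance "k3" estimator, which is non-negative and, under sampling from the current policy, unbiased for the true KL divergence.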
112 changes: 0 additions & 112 deletions examples/async_gsm8k/verl_config.yaml
@@ -1,31 +1,13 @@
data:
tokenizer: null
train_files: placeholder
val_files: placeholder
prompt_key: prompt
max_prompt_length: 256
max_response_length: 1024
train_batch_size: 256
val_batch_size: null
return_raw_input_ids: False # This should be set to True when the tokenizers of the policy and the reward model differ
return_raw_chat: False
shuffle: True
filter_overlong_prompts: False # for large-scale datasets, filtering overlong prompts could be time-consuming. You should disable this and set `truncation='left'`
truncation: error
image_key: images

actor_rollout_ref:
hybrid_engine: True
model:
path: /PATH/TO/MODEL/
external_lib: null
override_config: { }
enable_gradient_checkpointing: True
use_remove_padding: True # False
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 128
# ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 4
use_dynamic_bsz: True # False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
@@ -61,92 +43,10 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
# log_prob_micro_batch_size: 4 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
use_fire_sampling: False # https://arxiv.org/abs/2410.21236
prompt_length: ${data.max_prompt_length} # not used for open-source
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.4
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_model_len: null
max_num_seqs: 1024
# log_prob_micro_batch_size: 8 # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 4
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
disable_log_stats: True
enable_chunked_prefill: True # can yield higher throughput
# for hf rollout
do_sample: True
# number of responses (i.e. num sample times)
n: 8 # > 1 for grpo

critic:
strategy: fsdp
optim:
lr: 1e-5
lr_warmup_steps_ratio: 0. # the total steps will be injected at runtime
# min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be overridden by the program
model:
path: /PATH/TO/MODEL/
tokenizer_path: ${actor_rollout_ref.model.path}
override_config: { }
external_lib: ${actor_rollout_ref.model.external_lib}
enable_gradient_checkpointing: True
use_remove_padding: False
fsdp_config:
param_offload: False
optimizer_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
# ppo_micro_batch_size: 8 # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 64
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: 1 # sp size
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
cliprange_value: 0.5

reward_model:
enable: False
strategy: fsdp
model:
input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
path: ~/models/FsfairX-LLaMA3-RM-v0.1
external_lib: ${actor_rollout_ref.model.external_lib}
use_remove_padding: False
fsdp_config:
min_num_params: 0
param_offload: False
fsdp_size: -1
# micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
# micro_batch_size_per_gpu: 2 # set a number
# max_length: null
ulysses_sequence_parallel_size: 1 # sp size
use_dynamic_bsz: ${critic.use_dynamic_bsz}
forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

custom_reward_function:
path: null
@@ -155,29 +55,17 @@ custom_reward_function:
algorithm:
gamma: 1.0
lam: 1.0
adv_estimator: grpo
kl_penalty: kl # how to estimate kl divergence
kl_ctrl:
type: fixed
kl_coef: 0.001

trainer:
balance_batch: True
total_epochs: 10
# total_training_steps: null
project_name: rft_example_gsm8k
experiment_name: cys-qwen2_1.5b_rollout8_grpo_kl0.001_lr1e-5
logger: [ 'console','wandb' ]
val_generations_to_log_to_wandb: 0
nnodes: 1
n_gpus_per_node: 2
save_freq: 100
# auto: find the last checkpoint to resume from; if none is found, start from scratch
resume_mode: auto # or disable, or resume_path (used together with resume_from_path)
test_freq: 5
critic_warmup: 0
default_hdfs_dir: null
remove_previous_ckpt_in_save: False
del_local_ckpt_after_load: False
default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
val_before_train: False