{
  "architectures": [
    "InternVideo2_CLIP_small"
  ],
  "auto_map": {
    "AutoConfig": "config.InternVideo2Config",
    "AutoModel": "modeling_internvideo2encoder.InternVideo2_CLIP_small"
  },
  "auto_resume": false,
  "batch_size": 64,
  "batch_size_test": 4,
  "best_key": [
    "msrvtt_1k_test_match",
    "t2v_r1"
  ],
  "compile_model": false,
  "criterion": {
    "clip_loss_ratio": [
      1.0,
      1.0
    ],
    "distill_final_features": true,
    "loss_weight": {
      "mlm": 1.0,
      "mvm": 0.0,
      "uta": 0.0,
      "vtc": 1.0,
      "vtm": 1.0
    },
    "mlm_masking_prob": 0.5,
    "vtm_hard_neg": true
  },
  "debug": false,
  "deep_fusion": false,
  "deepspeed": {
    "enable": true,
    "stage": 1
  },
  "delete_ds_optim_states": true,
  "device": "cuda",
  "dist_url": "env://",
  "evaluate": false,
  "evaluation": {
    "eval_frame_ensemble": "concat",
    "eval_offload": true,
    "eval_x_only": false,
    "k_test": 128
  },
  "gradient_checkpointing": true,
  "inputs": {
    "batch_size": {
      "image": 64,
      "video": 64
    },
    "batch_size_test": {
      "image": 4,
      "video": 4
    },
    "image_res": 224,
    "max_txt_l": {
      "image": 32,
      "video": 32
    },
    "video_input": {
      "num_frames": 8,
      "num_frames_test": 8,
      "random_aug": false,
      "sample_type": "middle",
      "sample_type_test": "middle"
    }
  },
  "jump_evaluate": false,
  "log_freq": 100,
  "max_txt_l": 32,
  "mode": "pt",
  "model": {
    "embed_dim": 1024,
    "find_unused_parameters": false,
    "freeze_text": true,
    "freeze_vision": true,
    "load_vision_ckpt_from_internvideo2_stage2": false,
    "model_cls": "InternVideo2_CLIP_small",
    "multimodal": {
      "enable": true
    },
    "open_text_projection": false,
    "open_vision_clip_projector": true,
    "temp": 0.01,
    "temp_min": 0.01,
    "text_encoder": {
      "embed_dim": 512,
      "image_cfg": {
        "image_size": 224,
        "model_name": "vit_b16"
      },
      "text_cfg": {
        "causal_masking": true,
        "context_length": 77,
        "dim": 512,
        "ffn_multiplier_per_layer": 4.0,
        "model_name": "base",
        "n_heads_per_layer": 8,
        "n_transformer_layers": 12,
        "norm_layer": "layer_norm_fp32",
        "vocab_size": 49408
      }
    },
    "vision_encoder": {
      "align_dim": 512,
      "attn_pool_num_heads": 16,
      "checkpoint_num": 0,
      "clip_embed_dim": 768,
      "depth": 24,
      "drop_cls_token": false,
      "drop_path_rate": 0.0,
      "embed_dim": 1024,
      "fused_mlp_heuristic": 1,
      "head_drop_path_rate": 0.0,
      "img_size": 224,
      "in_chans": 3,
      "init_values": 0.1,
      "layerscale_no_force_fp32": true,
      "mlp_ratio": 4,
      "name": "internvideo2_1B",
      "num_frames": 8,
      "num_heads": 16,
      "patch_size": 14,
      "qk_normalization": true,
      "qkv_bias": false,
      "sep_pos_embed": false,
      "tubelet_size": 1,
      "use_checkpoint": false,
      "use_flash_attn": false,
      "use_fused_mlp": false,
      "use_fused_rmsnorm": false
    }
  },
  "model_type": "internvideo2",
  "num_frames": 8,
  "num_frames_test": 8,
  "num_workers": 6,
  "optimizer": {
    "different_lr": {
      "enable": false,
      "lr": 0.001,
      "module_names": []
    },
    "lr": 5e-05,
    "max_grad_norm": 3.0,
    "opt": "adamW",
    "opt_betas": [
      0.9,
      0.98
    ],
    "weight_decay": 0.05
  },
  "output_dir": null,
  "pretrained_path": "",
  "resume": false,
  "save_ckpt_iter": null,
  "save_latest": true,
  "scheduler": {
    "epochs": 10,
    "min_lr_multi": 0.01,
    "sched": "cosine",
    "warmup_epochs": 1
  },
  "seed": 42,
  "test_file": {
    "didemo_ret_test": "available_corpus[\"didemo_ret_test\"]",
    "msrvtt_1k_test": "available_corpus[\"msrvtt_1k_test\"]"
  },
  "test_types": [
    "msrvtt_1k_test",
    "didemo_ret_test"
  ],
  "text_enc": "bert_large",
  "tokenizer": null,
  "torch_dtype": "float16",
  "train_file": "available_corpus[\"pretrain_example_data_1B\"]",
  "transformers_version": "4.51.3",
  "use_bf16": true,
  "use_flash_sdp": false,
  "use_half_precision": false,
  "use_mem_efficient_sdp": false,
  "wandb": {
    "enable": false,
    "entity": "opengvlab",
    "project": "InternVideo2-Stage2"
  }
}
|
|