RJK committed on
Commit
dc26eb1
·
verified ·
1 Parent(s): 593fee5

upload pi05 and gr00t LIBERO fine-tuned checkpoints (incremental cache)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. gr00t_eagle_3b_libero_10_full_finetune_bs64/checkpoints/step-028548-epoch-18-loss=0.0546.safetensors +3 -0
  3. gr00t_eagle_3b_libero_10_full_finetune_bs64/config.json +6 -9
  4. gr00t_eagle_3b_libero_10_full_finetune_bs64/config.yaml +5 -8
  5. gr00t_eagle_3b_libero_10_full_finetune_bs64/gr00t_eagle_3b_libero_10_full_finetune_2026_05_14_02_41_08.jsonl +0 -0
  6. gr00t_eagle_3b_libero_10_full_finetune_bs64/run-metrics.jsonl +1 -1
  7. gr00t_eagle_3b_libero_10_full_finetune_bs64/tokenizer/merges.txt +1 -1
  8. gr00t_eagle_3b_libero_goal_full_finetune_bs64/checkpoints/step-014886-epoch-18-loss=0.0550.safetensors +3 -0
  9. gr00t_eagle_3b_libero_goal_full_finetune_bs64/config.json +302 -0
  10. gr00t_eagle_3b_libero_goal_full_finetune_bs64/config.yaml +220 -0
  11. gr00t_eagle_3b_libero_goal_full_finetune_bs64/dataset_statistics.json +138 -0
  12. gr00t_eagle_3b_libero_goal_full_finetune_bs64/gr00t_eagle_3b_libero_goal_full_finetune_2026_05_14_02_40_40.jsonl +0 -0
  13. gr00t_eagle_3b_libero_goal_full_finetune_bs64/run-metrics.jsonl +1 -0
  14. gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/added_tokens.json +39 -0
  15. gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/merges.txt +0 -0
  16. gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/special_tokens_map.json +42 -0
  17. gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/tokenizer_config.json +344 -0
  18. gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/vocab.json +0 -0
  19. gr00t_eagle_3b_libero_goal_full_finetune_bs64/vlm_backbone_config.json +106 -0
  20. gr00t_eagle_3b_libero_object_full_finetune_bs64/checkpoints/step-018846-epoch-18-loss=0.0701.safetensors +3 -0
  21. gr00t_eagle_3b_libero_object_full_finetune_bs64/config.json +302 -0
  22. gr00t_eagle_3b_libero_object_full_finetune_bs64/config.yaml +220 -0
  23. gr00t_eagle_3b_libero_object_full_finetune_bs64/dataset_statistics.json +104 -0
  24. gr00t_eagle_3b_libero_object_full_finetune_bs64/gr00t_eagle_3b_libero_object_full_finetune_2026_05_14_02_40_05.jsonl +0 -0
  25. gr00t_eagle_3b_libero_object_full_finetune_bs64/run-metrics.jsonl +1 -0
  26. gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/added_tokens.json +39 -0
  27. gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/merges.txt +0 -0
  28. gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/special_tokens_map.json +42 -0
  29. gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/tokenizer_config.json +344 -0
  30. gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/vocab.json +0 -0
  31. gr00t_eagle_3b_libero_object_full_finetune_bs64/vlm_backbone_config.json +106 -0
  32. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/checkpoints/step-014904-epoch-18-loss=0.0780.safetensors +3 -0
  33. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/config.json +302 -0
  34. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/config.yaml +220 -0
  35. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/dataset_statistics.json +104 -0
  36. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/gr00t_eagle_3b_libero_spatial_full_finetune_2026_05_14_02_41_01.jsonl +0 -0
  37. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/run-metrics.jsonl +1 -0
  38. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/added_tokens.json +39 -0
  39. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/merges.txt +0 -0
  40. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/special_tokens_map.json +42 -0
  41. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/tokenizer_config.json +344 -0
  42. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/vocab.json +0 -0
  43. gr00t_eagle_3b_libero_spatial_full_finetune_bs64/vlm_backbone_config.json +106 -0
  44. pi05_paligemma_libero_10_full_finetune_bs64/checkpoints/step-038064-epoch-24-loss=0.0170.safetensors +3 -0
  45. pi05_paligemma_libero_10_full_finetune_bs64/config.json +3 -4
  46. pi05_paligemma_libero_10_full_finetune_bs64/config.yaml +3 -4
  47. pi05_paligemma_libero_10_full_finetune_bs64/pi05_paligemma_libero_10_full_finetune_2026_05_15_09_15_10.jsonl +3 -0
  48. pi05_paligemma_libero_10_full_finetune_bs64/run-metrics.jsonl +1 -1
  49. pi05_paligemma_libero_goal_full_finetune_bs64/checkpoints/step-019848-epoch-24-loss=0.0145.safetensors +3 -0
  50. pi05_paligemma_libero_goal_full_finetune_bs64/config.json +355 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ pi05_libero/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  pi05_base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  pi0_base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
  gr00t_qwen3vl_0.6b_libero/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
37
  pi05_base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  pi0_base/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
  gr00t_qwen3vl_0.6b_libero/tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ pi05_paligemma_libero_10_full_finetune_bs64/pi05_paligemma_libero_10_full_finetune_2026_05_15_09_15_10.jsonl filter=lfs diff=lfs merge=lfs -text
gr00t_eagle_3b_libero_10_full_finetune_bs64/checkpoints/step-028548-epoch-18-loss=0.0546.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2485a81646356973d21f9c08075fbe3ba567f283ab16e31793fd682fdbb27c5
3
+ size 10896783888
gr00t_eagle_3b_libero_10_full_finetune_bs64/config.json CHANGED
@@ -54,7 +54,7 @@
54
  "max_len": 600,
55
  "num_images": 2,
56
  "tokenizer": {
57
- "model_path": "/limx/tos/users/liyinhao/projects/eagle2_hg_model",
58
  "type": "PretrainedTokenizer"
59
  },
60
  "type": "ProcessPromptsWithImage"
@@ -99,8 +99,7 @@
99
  "num_attention_heads": 32,
100
  "num_layers": 16,
101
  "output_dim": 1024,
102
- "positional_embeddings": null,
103
- "use_torch_compile": true
104
  },
105
  "hidden_size": 1024,
106
  "input_embedding_dim": 1536,
@@ -173,12 +172,10 @@
173
  "lr_scheduler_type": "linear-warmup+cosine-decay",
174
  "max_epochs": 18,
175
  "max_grad_norm": 1.0,
176
- "max_keep_ckpts": 1,
177
- "max_steps": null,
178
  "metric": {
179
  "active_trackers": [
180
  "jsonl",
181
- "wandb"
182
  ],
183
  "grad_accumulation_steps": 1,
184
  "run_dir": "work_dirs",
@@ -188,7 +185,7 @@
188
  "mixed_precision_dtype": "bf16",
189
  "sampler": null,
190
  "tokenizer": {
191
- "model_path": "/limx/tos/users/liyinhao/projects/eagle2_hg_model",
192
  "type": "PretrainedTokenizer"
193
  },
194
  "type": "FSDPTrainRunner",
@@ -372,7 +369,7 @@
372
  "max_len": 600,
373
  "num_images": 2,
374
  "tokenizer": {
375
- "model_path": "/limx/tos/users/liyinhao/projects/eagle2_hg_model",
376
  "type": "PretrainedTokenizer"
377
  },
378
  "type": "ProcessPromptsWithImage"
@@ -430,7 +427,7 @@
430
  "proprio"
431
  ]
432
  },
433
- "seed": 1,
434
  "statistic_keys": [
435
  "observation.state",
436
  "timestamp",
 
54
  "max_len": 600,
55
  "num_images": 2,
56
  "tokenizer": {
57
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
58
  "type": "PretrainedTokenizer"
59
  },
60
  "type": "ProcessPromptsWithImage"
 
99
  "num_attention_heads": 32,
100
  "num_layers": 16,
101
  "output_dim": 1024,
102
+ "positional_embeddings": null
 
103
  },
104
  "hidden_size": 1024,
105
  "input_embedding_dim": 1536,
 
172
  "lr_scheduler_type": "linear-warmup+cosine-decay",
173
  "max_epochs": 18,
174
  "max_grad_norm": 1.0,
 
 
175
  "metric": {
176
  "active_trackers": [
177
  "jsonl",
178
+ "tensorboard"
179
  ],
180
  "grad_accumulation_steps": 1,
181
  "run_dir": "work_dirs",
 
185
  "mixed_precision_dtype": "bf16",
186
  "sampler": null,
187
  "tokenizer": {
188
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
189
  "type": "PretrainedTokenizer"
190
  },
191
  "type": "FSDPTrainRunner",
 
369
  "max_len": 600,
370
  "num_images": 2,
371
  "tokenizer": {
372
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
373
  "type": "PretrainedTokenizer"
374
  },
375
  "type": "ProcessPromptsWithImage"
 
427
  "proprio"
428
  ]
429
  },
430
+ "seed": 7,
431
  "statistic_keys": [
432
  "observation.state",
433
  "timestamp",
gr00t_eagle_3b_libero_10_full_finetune_bs64/config.yaml CHANGED
@@ -32,7 +32,7 @@ eval:
32
  - max_len: 600
33
  num_images: 2
34
  tokenizer:
35
- model_path: /limx/tos/users/liyinhao/projects/eagle2_hg_model
36
  type: PretrainedTokenizer
37
  type: ProcessPromptsWithImage
38
  - gripper_key: robot0_gripper_qpos
@@ -70,7 +70,6 @@ inference_model:
70
  num_layers: 16
71
  output_dim: 1024
72
  positional_embeddings: null
73
- use_torch_compile: true
74
  hidden_size: 1024
75
  input_embedding_dim: 1536
76
  num_heads: 4
@@ -132,12 +131,10 @@ runner:
132
  lr_scheduler_type: linear-warmup+cosine-decay
133
  max_epochs: 18
134
  max_grad_norm: 1.0
135
- max_keep_ckpts: 1
136
- max_steps: null
137
  metric:
138
  active_trackers:
139
  - jsonl
140
- - wandb
141
  grad_accumulation_steps: 1
142
  run_dir: work_dirs
143
  type: VLAMetric
@@ -145,7 +142,7 @@ runner:
145
  mixed_precision_dtype: bf16
146
  sampler: null
147
  tokenizer:
148
- model_path: /limx/tos/users/liyinhao/projects/eagle2_hg_model
149
  type: PretrainedTokenizer
150
  type: FSDPTrainRunner
151
  warmup_ratio: 0.03
@@ -296,7 +293,7 @@ train_dataloader:
296
  - max_len: 600
297
  num_images: 2
298
  tokenizer:
299
- model_path: /limx/tos/users/liyinhao/projects/eagle2_hg_model
300
  type: PretrainedTokenizer
301
  type: ProcessPromptsWithImage
302
  - height: 224
@@ -331,7 +328,7 @@ train_dataloader:
331
  - action
332
  observation.state:
333
  - proprio
334
- seed: 1
335
  statistic_keys:
336
  - observation.state
337
  - timestamp
 
32
  - max_len: 600
33
  num_images: 2
34
  tokenizer:
35
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
36
  type: PretrainedTokenizer
37
  type: ProcessPromptsWithImage
38
  - gripper_key: robot0_gripper_qpos
 
70
  num_layers: 16
71
  output_dim: 1024
72
  positional_embeddings: null
 
73
  hidden_size: 1024
74
  input_embedding_dim: 1536
75
  num_heads: 4
 
131
  lr_scheduler_type: linear-warmup+cosine-decay
132
  max_epochs: 18
133
  max_grad_norm: 1.0
 
 
134
  metric:
135
  active_trackers:
136
  - jsonl
137
+ - tensorboard
138
  grad_accumulation_steps: 1
139
  run_dir: work_dirs
140
  type: VLAMetric
 
142
  mixed_precision_dtype: bf16
143
  sampler: null
144
  tokenizer:
145
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
146
  type: PretrainedTokenizer
147
  type: FSDPTrainRunner
148
  warmup_ratio: 0.03
 
293
  - max_len: 600
294
  num_images: 2
295
  tokenizer:
296
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
297
  type: PretrainedTokenizer
298
  type: ProcessPromptsWithImage
299
  - height: 224
 
328
  - action
329
  observation.state:
330
  - proprio
331
+ seed: 7
332
  statistic_keys:
333
  - observation.state
334
  - timestamp
gr00t_eagle_3b_libero_10_full_finetune_bs64/gr00t_eagle_3b_libero_10_full_finetune_2026_05_14_02_41_08.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_10_full_finetune_bs64/run-metrics.jsonl CHANGED
@@ -1 +1 @@
1
- {"hparams": "{'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'dtype': 'bf16', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'use_torch_compile': True, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'seed': 1, 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'dataset_statistics': {'libero_10_no_noops': {'proprio': {'mean': [-0.0419132679050224, 0.034591788297521735, 0.8265881844959498, 2.90259518190321, -0.5570652600832564, -0.16592166873533284, 0.02845031351083622, -0.02880236273799356], 'std': [0.03756502182067285, 0.05091765880150317, 0.09107525593038836, 0.12327524826514363, 0.4418352294043351, 0.12490994022681218, 0.004662133639412193, 0.00460807817987938], 'min': 
[-0.48278069496154785, -0.3309336006641388, 0.44550687074661255, 1.1323540210723877, -3.6312508583068848, -1.842738389968872, -0.005453015677630901, -0.04112039878964424], 'max': [0.2103137969970703, 0.38887521624565125, 1.333192229270935, 3.7248642444610596, 3.5618896484375, 1.3863215446472168, 0.041575800627470016, 0.0013126095291227102], 'q01': [-0.1855636807291125, -0.16145669766439186, 0.7064185725262808, 2.5678211534702324, -1.2430377303522737, -0.5195810482339626, 0.01022917473133343, -0.03999379658232052], 'q99': [0.05938728483051665, 0.2361478409238694, 0.9397258571145816, 3.2118708728143526, 0.49082919816100534, 0.2100883989120329, 0.040047131839991014, -0.011104049991952391]}, 'timestamp': {'mean': [7.007510548523206], 'std': [4.457129586378845], 'min': [0.0], 'max': [25.2], 'q01': None, 'q99': None}, 'action': {'mean': [0.01905656634877842, 0.05672475971568838, -0.056239289430234256, 0.004756678478841528, 0.002797492338491304, -0.00714607048416358, 0.54599156235075], 'std': [0.10588348353857541, 0.13552477199270377, 0.13886650724555177, 0.01433739270759898, 0.02038583948325967, 0.033299202425577934, 0.1881810653484855], 'min': [-0.9375, -0.9375, -0.9375, -0.23642857372760773, -0.3053571283817291, -0.3642857074737549, 0.0], 'max': [0.9375, 0.9375, 0.9375, 0.32892856001853943, 0.36964285373687744, 0.375, 1.0], 'q01': [-0.4997477764535965, -0.6992653512084763, -0.6543309163615124, -0.07417070079989778, -0.11898748445770971, -0.15976085962510805, 0.0], 'q99': [0.658747846713789, 0.7333480638990948, 0.768601965587579, 0.09784501244893279, 0.12943469061349036, 0.15137893471596325, 1.0]}}}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 
'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': './checkpoints/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': './checkpoints/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': './work_dirs/73d1dcc4f_gr00t_eagle_3b_libero_10_full_finetune_bs64_seed1', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'dtype': 'bf16', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 
'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'use_torch_compile': True, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'seed': 1, 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'dataset_statistics': {'libero_10_no_noops': {'proprio': {'mean': [-0.0419132679050224, 0.034591788297521735, 0.8265881844959498, 2.90259518190321, -0.5570652600832564, -0.16592166873533284, 0.02845031351083622, -0.02880236273799356], 'std': [0.03756502182067285, 0.05091765880150317, 0.09107525593038836, 0.12327524826514363, 0.4418352294043351, 0.12490994022681218, 0.004662133639412193, 0.00460807817987938], 'min': [-0.48278069496154785, -0.3309336006641388, 0.44550687074661255, 1.1323540210723877, -3.6312508583068848, -1.842738389968872, -0.005453015677630901, -0.04112039878964424], 'max': [0.2103137969970703, 0.38887521624565125, 1.333192229270935, 3.7248642444610596, 3.5618896484375, 1.3863215446472168, 0.041575800627470016, 
0.0013126095291227102], 'q01': [-0.1855636807291125, -0.16145669766439186, 0.7064185725262808, 2.5678211534702324, -1.2430377303522737, -0.5195810482339626, 0.01022917473133343, -0.03999379658232052], 'q99': [0.05938728483051665, 0.2361478409238694, 0.9397258571145816, 3.2118708728143526, 0.49082919816100534, 0.2100883989120329, 0.040047131839991014, -0.011104049991952391]}, 'timestamp': {'mean': [7.007510548523206], 'std': [4.457129586378845], 'min': [0.0], 'max': [25.2], 'q01': None, 'q99': None}, 'action': {'mean': [0.01905656634877842, 0.05672475971568838, -0.056239289430234256, 0.004756678478841528, 0.002797492338491304, -0.00714607048416358, 0.54599156235075], 'std': [0.10588348353857541, 0.13552477199270377, 0.13886650724555177, 0.01433739270759898, 0.02038583948325967, 0.033299202425577934, 0.1881810653484855], 'min': [-0.9375, -0.9375, -0.9375, -0.23642857372760773, -0.3053571283817291, -0.3642857074737549, 0.0], 'max': [0.9375, 0.9375, 0.9375, 0.32892856001853943, 0.36964285373687744, 0.375, 1.0], 'q01': [-0.4997477764535965, -0.6992653512084763, -0.6543309163615124, -0.07417070079989778, -0.11898748445770971, -0.15976085962510805, 0.0], 'q99': [0.658747846713789, 0.7333480638990948, 0.768601965587579, 0.09784501244893279, 0.12943469061349036, 0.15137893471596325, 1.0]}}}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 
'./checkpoints/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': './checkpoints/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': './work_dirs/73d1dcc4f_gr00t_eagle_3b_libero_10_full_finetune_bs64_seed1', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_10_full_finetune_2026_03_15_08_28_06'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'max_steps': None, 'max_keep_ckpts': 1, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py', work_dir='./work_dirs/73d1dcc4f_gr00t_eagle_3b_libero_10_full_finetune_bs64_seed1', 
cfg_options={'train_dataloader.per_device_batch_size': 8, 'runner.max_epochs': 18, 'runner.max_steps': None, 'runner.max_keep_ckpts': 1, 'train_dataloader.dataset.seed': 1}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': './checkpoints/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'run_id': 'gr00t_eagle_3b_libero_10_full_finetune_2026_03_15_08_28_06'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'max_steps': None, 'max_keep_ckpts': 1, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'dtype': 'bf16', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 
'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'use_torch_compile': True, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'seed': 1, 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'dataset_statistics': {'libero_10_no_noops': {'proprio': {'mean': [-0.0419132679050224, 0.034591788297521735, 0.8265881844959498, 2.90259518190321, -0.5570652600832564, -0.16592166873533284, 0.02845031351083622, -0.02880236273799356], 'std': [0.03756502182067285, 0.05091765880150317, 0.09107525593038836, 0.12327524826514363, 0.4418352294043351, 0.12490994022681218, 0.004662133639412193, 0.00460807817987938], 'min': [-0.48278069496154785, -0.3309336006641388, 0.44550687074661255, 1.1323540210723877, -3.6312508583068848, -1.842738389968872, -0.005453015677630901, -0.04112039878964424], 'max': [0.2103137969970703, 0.38887521624565125, 1.333192229270935, 3.7248642444610596, 
3.5618896484375, 1.3863215446472168, 0.041575800627470016, 0.0013126095291227102], 'q01': [-0.1855636807291125, -0.16145669766439186, 0.7064185725262808, 2.5678211534702324, -1.2430377303522737, -0.5195810482339626, 0.01022917473133343, -0.03999379658232052], 'q99': [0.05938728483051665, 0.2361478409238694, 0.9397258571145816, 3.2118708728143526, 0.49082919816100534, 0.2100883989120329, 0.040047131839991014, -0.011104049991952391]}, 'timestamp': {'mean': [7.007510548523206], 'std': [4.457129586378845], 'min': [0.0], 'max': [25.2], 'q01': None, 'q99': None}, 'action': {'mean': [0.01905656634877842, 0.05672475971568838, -0.056239289430234256, 0.004756678478841528, 0.002797492338491304, -0.00714607048416358, 0.54599156235075], 'std': [0.10588348353857541, 0.13552477199270377, 0.13886650724555177, 0.01433739270759898, 0.02038583948325967, 0.033299202425577934, 0.1881810653484855], 'min': [-0.9375, -0.9375, -0.9375, -0.23642857372760773, -0.3053571283817291, -0.3642857074737549, 0.0], 'max': [0.9375, 0.9375, 0.9375, 0.32892856001853943, 0.36964285373687744, 0.375, 1.0], 'q01': [-0.4997477764535965, -0.6992653512084763, -0.6543309163615124, -0.07417070079989778, -0.11898748445770971, -0.15976085962510805, 0.0], 'q99': [0.658747846713789, 0.7333480638990948, 0.768601965587579, 0.09784501244893279, 0.12943469061349036, 0.15137893471596325, 1.0]}}}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': 
{'type': 'PretrainedTokenizer', 'model_path': './checkpoints/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': './checkpoints/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': './work_dirs/73d1dcc4f_gr00t_eagle_3b_libero_10_full_finetune_bs64_seed1', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_10_full_finetune_2026_03_15_08_28_06'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'max_steps': None, 'max_keep_ckpts': 1, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py', work_dir='./work_dirs/73d1dcc4f_gr00t_eagle_3b_libero_10_full_finetune_bs64_seed1', 
cfg_options={'train_dataloader.per_device_batch_size': 8, 'runner.max_epochs': 18, 'runner.max_steps': None, 'runner.max_keep_ckpts': 1, 'train_dataloader.dataset.seed': 1}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': './checkpoints/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py', work_dir='./work_dirs/73d1dcc4f_gr00t_eagle_3b_libero_10_full_finetune_bs64_seed1', cfg_options={'train_dataloader.per_device_batch_size': 8, 'runner.max_epochs': 18, 'runner.max_steps': None, 'runner.max_keep_ckpts': 1, 'train_dataloader.dataset.seed': 1}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 
'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': './checkpoints/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}", "run_id": "gr00t_eagle_3b_libero_10_full_finetune_2026_03_15_08_28_06"}
 
1
+ {"hparams": "{'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'dtype': 'bf16', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'seed': 7, 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'dataset_statistics': {'libero_10_no_noops': {'proprio': {'mean': [-0.0419132679050224, 0.034591788297521735, 0.8265881844959498, 2.90259518190321, -0.5570652600832564, -0.16592166873533284, 0.02845031351083622, -0.02880236273799356], 'std': [0.03756502182067285, 0.05091765880150317, 0.09107525593038836, 0.12327524826514363, 0.4418352294043351, 0.12490994022681218, 0.004662133639412193, 0.00460807817987938], 'min': [-0.48278069496154785, 
-0.3309336006641388, 0.44550687074661255, 1.1323540210723877, -3.6312508583068848, -1.842738389968872, -0.005453015677630901, -0.04112039878964424], 'max': [0.2103137969970703, 0.38887521624565125, 1.333192229270935, 3.7248642444610596, 3.5618896484375, 1.3863215446472168, 0.041575800627470016, 0.0013126095291227102], 'q01': [-0.1855636807291125, -0.16145669766439186, 0.7064185725262808, 2.5678211534702324, -1.2430377303522737, -0.5195810482339626, 0.01022917473133343, -0.03999379658232052], 'q99': [0.05938728483051665, 0.2361478409238694, 0.9397258571145816, 3.2118708728143526, 0.49082919816100534, 0.2100883989120329, 0.040047131839991014, -0.011104049991952391]}, 'timestamp': {'mean': [7.007510548523206], 'std': [4.457129586378845], 'min': [0.0], 'max': [25.2], 'q01': None, 'q99': None}, 'action': {'mean': [0.01905656634877842, 0.05672475971568838, -0.056239289430234256, 0.004756678478841528, 0.002797492338491304, -0.00714607048416358, 0.54599156235075], 'std': [0.10588348353857541, 0.13552477199270377, 0.13886650724555177, 0.01433739270759898, 0.02038583948325967, 0.033299202425577934, 0.1881810653484855], 'min': [-0.9375, -0.9375, -0.9375, -0.23642857372760773, -0.3053571283817291, -0.3642857074737549, 0.0], 'max': [0.9375, 0.9375, 0.9375, 0.32892856001853943, 0.36964285373687744, 0.375, 1.0], 'q01': [-0.4997477764535965, -0.6992653512084763, -0.6543309163615124, -0.07417070079989778, -0.11898748445770971, -0.15976085962510805, 0.0], 'q99': [0.658747846713789, 0.7333480638990948, 0.768601965587579, 0.09784501244893279, 0.12943469061349036, 0.15137893471596325, 1.0]}}}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': 
['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_10', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'dtype': 'bf16', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 
'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'seed': 7, 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'dataset_statistics': {'libero_10_no_noops': {'proprio': {'mean': [-0.0419132679050224, 0.034591788297521735, 0.8265881844959498, 2.90259518190321, -0.5570652600832564, -0.16592166873533284, 0.02845031351083622, -0.02880236273799356], 'std': [0.03756502182067285, 0.05091765880150317, 0.09107525593038836, 0.12327524826514363, 0.4418352294043351, 0.12490994022681218, 0.004662133639412193, 0.00460807817987938], 'min': [-0.48278069496154785, -0.3309336006641388, 0.44550687074661255, 1.1323540210723877, -3.6312508583068848, -1.842738389968872, -0.005453015677630901, -0.04112039878964424], 'max': [0.2103137969970703, 0.38887521624565125, 1.333192229270935, 3.7248642444610596, 3.5618896484375, 1.3863215446472168, 0.041575800627470016, 0.0013126095291227102], 'q01': 
[-0.1855636807291125, -0.16145669766439186, 0.7064185725262808, 2.5678211534702324, -1.2430377303522737, -0.5195810482339626, 0.01022917473133343, -0.03999379658232052], 'q99': [0.05938728483051665, 0.2361478409238694, 0.9397258571145816, 3.2118708728143526, 0.49082919816100534, 0.2100883989120329, 0.040047131839991014, -0.011104049991952391]}, 'timestamp': {'mean': [7.007510548523206], 'std': [4.457129586378845], 'min': [0.0], 'max': [25.2], 'q01': None, 'q99': None}, 'action': {'mean': [0.01905656634877842, 0.05672475971568838, -0.056239289430234256, 0.004756678478841528, 0.002797492338491304, -0.00714607048416358, 0.54599156235075], 'std': [0.10588348353857541, 0.13552477199270377, 0.13886650724555177, 0.01433739270759898, 0.02038583948325967, 0.033299202425577934, 0.1881810653484855], 'min': [-0.9375, -0.9375, -0.9375, -0.23642857372760773, -0.3053571283817291, -0.3642857074737549, 0.0], 'max': [0.9375, 0.9375, 0.9375, 0.32892856001853943, 0.36964285373687744, 0.375, 1.0], 'q01': [-0.4997477764535965, -0.6992653512084763, -0.6543309163615124, -0.07417070079989778, -0.11898748445770971, -0.15976085962510805, 0.0], 'q99': [0.658747846713789, 0.7333480638990948, 0.768601965587579, 0.09784501244893279, 0.12943469061349036, 0.15137893471596325, 1.0]}}}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 
'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_10', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_10_full_finetune_2026_05_14_02_41_08'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_10', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, 
resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'run_id': 'gr00t_eagle_3b_libero_10_full_finetune_2026_05_14_02_41_08'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'dtype': 'bf16', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': 
False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'seed': 7, 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'dataset_statistics': {'libero_10_no_noops': {'proprio': {'mean': [-0.0419132679050224, 0.034591788297521735, 0.8265881844959498, 2.90259518190321, -0.5570652600832564, -0.16592166873533284, 0.02845031351083622, -0.02880236273799356], 'std': [0.03756502182067285, 0.05091765880150317, 0.09107525593038836, 0.12327524826514363, 0.4418352294043351, 0.12490994022681218, 0.004662133639412193, 0.00460807817987938], 'min': [-0.48278069496154785, -0.3309336006641388, 0.44550687074661255, 1.1323540210723877, -3.6312508583068848, -1.842738389968872, -0.005453015677630901, -0.04112039878964424], 'max': [0.2103137969970703, 0.38887521624565125, 1.333192229270935, 3.7248642444610596, 3.5618896484375, 1.3863215446472168, 0.041575800627470016, 0.0013126095291227102], 'q01': [-0.1855636807291125, -0.16145669766439186, 0.7064185725262808, 2.5678211534702324, -1.2430377303522737, -0.5195810482339626, 0.01022917473133343, 
-0.03999379658232052], 'q99': [0.05938728483051665, 0.2361478409238694, 0.9397258571145816, 3.2118708728143526, 0.49082919816100534, 0.2100883989120329, 0.040047131839991014, -0.011104049991952391]}, 'timestamp': {'mean': [7.007510548523206], 'std': [4.457129586378845], 'min': [0.0], 'max': [25.2], 'q01': None, 'q99': None}, 'action': {'mean': [0.01905656634877842, 0.05672475971568838, -0.056239289430234256, 0.004756678478841528, 0.002797492338491304, -0.00714607048416358, 0.54599156235075], 'std': [0.10588348353857541, 0.13552477199270377, 0.13886650724555177, 0.01433739270759898, 0.02038583948325967, 0.033299202425577934, 0.1881810653484855], 'min': [-0.9375, -0.9375, -0.9375, -0.23642857372760773, -0.3053571283817291, -0.3642857074737549, 0.0], 'max': [0.9375, 0.9375, 0.9375, 0.32892856001853943, 0.36964285373687744, 0.375, 1.0], 'q01': [-0.4997477764535965, -0.6992653512084763, -0.6543309163615124, -0.07417070079989778, -0.11898748445770971, -0.15976085962510805, 0.0], 'q99': [0.658747846713789, 0.7333480638990948, 0.768601965587579, 0.09784501244893279, 0.12943469061349036, 0.15137893471596325, 1.0]}}}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], 
[123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_10', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_10_full_finetune_2026_05_14_02_41_08'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_10', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 
'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_10_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_10', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': 
{'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}", "run_id": "gr00t_eagle_3b_libero_10_full_finetune_2026_05_14_02_41_08"}
gr00t_eagle_3b_libero_10_full_finetune_bs64/tokenizer/merges.txt CHANGED
@@ -151385,4 +151385,4 @@ krä fte
151385
  áķ ·
151386
  âį ¨
151387
  ⺠Ł
151388
- â½ Ĺ
 
151385
  áķ ·
151386
  âį ¨
151387
  ⺠Ł
151388
+ â½ Ĺ
gr00t_eagle_3b_libero_goal_full_finetune_bs64/checkpoints/step-014886-epoch-18-loss=0.0550.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5a6817ea595796a6cc1ee83363141da7e2d3f519183c7addfc37696a2895162
3
+ size 10896783888
gr00t_eagle_3b_libero_goal_full_finetune_bs64/config.json ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval": {
3
+ "dataset": {
4
+ "transforms": [
5
+ {
6
+ "embodiment_id": 2,
7
+ "img_keys": [
8
+ "agentview_image",
9
+ "robot0_eye_in_hand_image"
10
+ ],
11
+ "type": "ProcessLiberoEvalInputs"
12
+ },
13
+ {
14
+ "image_resize_strategy": "resize-naive",
15
+ "input_sizes": [
16
+ [
17
+ 3,
18
+ 224,
19
+ 224
20
+ ],
21
+ [
22
+ 3,
23
+ 224,
24
+ 224
25
+ ]
26
+ ],
27
+ "means": [
28
+ [
29
+ 123.515625,
30
+ 116.04492188,
31
+ 103.59375
32
+ ],
33
+ [
34
+ 123.515625,
35
+ 116.04492188,
36
+ 103.59375
37
+ ]
38
+ ],
39
+ "stds": [
40
+ [
41
+ 58.27148438,
42
+ 57.02636719,
43
+ 57.27539062
44
+ ],
45
+ [
46
+ 58.27148438,
47
+ 57.02636719,
48
+ 57.27539062
49
+ ]
50
+ ],
51
+ "type": "TransformImage"
52
+ },
53
+ {
54
+ "max_len": 600,
55
+ "num_images": 2,
56
+ "tokenizer": {
57
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
58
+ "type": "PretrainedTokenizer"
59
+ },
60
+ "type": "ProcessPromptsWithImage"
61
+ },
62
+ {
63
+ "gripper_key": "robot0_gripper_qpos",
64
+ "norm_type": "mean_std",
65
+ "out_key": "states",
66
+ "pos_key": "robot0_eef_pos",
67
+ "quat_key": "robot0_eef_quat",
68
+ "state_dim": 64,
69
+ "type": "LiberoProprioFromInputs"
70
+ }
71
+ ],
72
+ "type": "LiberoParquetEvalDataset"
73
+ },
74
+ "denormalize_action": {
75
+ "norm_type": "mean_std",
76
+ "type": "DenormalizeLiberoAction"
77
+ },
78
+ "eval_chunk_size": 10,
79
+ "model_family": "pi0",
80
+ "num_steps_wait": 10,
81
+ "num_trials_per_task": 50,
82
+ "resize_size": 224,
83
+ "seed": 7,
84
+ "task_suite_name": "libero_goal",
85
+ "type": "LiberoEvalRunner"
86
+ },
87
+ "inference_model": {
88
+ "pretrained_name_or_path": "./checkpoints/GR00T-N1.5-3B",
89
+ "type": "LlavaVLA",
90
+ "vla_head": {
91
+ "action_dim": 32,
92
+ "diffusion_model_cfg": {
93
+ "attention_head_dim": 48,
94
+ "cross_attention_dim": 2048,
95
+ "dropout": 0.2,
96
+ "final_dropout": true,
97
+ "interleave_self_attention": true,
98
+ "norm_type": "ada_norm",
99
+ "num_attention_heads": 32,
100
+ "num_layers": 16,
101
+ "output_dim": 1024,
102
+ "positional_embeddings": null
103
+ },
104
+ "hidden_size": 1024,
105
+ "input_embedding_dim": 1536,
106
+ "num_heads": 4,
107
+ "num_inference_timesteps": 4,
108
+ "num_layers": 1,
109
+ "ori_action_dim": 7,
110
+ "state_dim": 64,
111
+ "traj_length": 10,
112
+ "type": "FlowMatchingInferenceHead"
113
+ },
114
+ "vlm_backbone": {
115
+ "type": "EagleInferenceBackbone",
116
+ "vlm_path": "fluxvla/models/third_party_models/eagle2_hg_model"
117
+ }
118
+ },
119
+ "model": {
120
+ "freeze_projector": false,
121
+ "freeze_vlm_backbone": false,
122
+ "name_mapping": {
123
+ "vla_head": "action_head",
124
+ "vlm_backbone.vlm": "backbone.eagle_model"
125
+ },
126
+ "pretrained_name_or_path": "./checkpoints/GR00T-N1.5-3B",
127
+ "type": "LlavaVLA",
128
+ "vla_head": {
129
+ "action_dim": 32,
130
+ "hidden_size": 1024,
131
+ "input_embedding_dim": 1536,
132
+ "num_heads": 4,
133
+ "num_inference_timesteps": 4,
134
+ "num_layers": 1,
135
+ "ori_action_dim": 7,
136
+ "state_dim": 64,
137
+ "traj_length": 10,
138
+ "type": "FlowMatchingHead"
139
+ },
140
+ "vlm_backbone": {
141
+ "type": "EagleBackbone",
142
+ "vlm_path": "fluxvla/models/third_party_models/eagle2_hg_model"
143
+ }
144
+ },
145
+ "runner": {
146
+ "change_key_name": false,
147
+ "collator": {
148
+ "keys": [
149
+ "states",
150
+ "observation.eepose",
151
+ "timestamp",
152
+ "images",
153
+ "img_masks",
154
+ "lang_tokens",
155
+ "lang_masks",
156
+ "actions",
157
+ "action_masks",
158
+ "embodiment_ids"
159
+ ],
160
+ "meta_keys": [
161
+ "task_description",
162
+ "prompt",
163
+ "info",
164
+ "stats"
165
+ ],
166
+ "type": "DictCollator"
167
+ },
168
+ "enable_gradient_checkpointing": false,
169
+ "enable_mixed_precision_training": true,
170
+ "learning_rate": 1.5e-05,
171
+ "lr_scheduler_type": "linear-warmup+cosine-decay",
172
+ "max_epochs": 18,
173
+ "max_grad_norm": 1.0,
174
+ "metric": {
175
+ "active_trackers": [
176
+ "jsonl",
177
+ "tensorboard"
178
+ ],
179
+ "grad_accumulation_steps": 1,
180
+ "run_dir": "work_dirs",
181
+ "type": "VLAMetric",
182
+ "window_size": 1
183
+ },
184
+ "mixed_precision_dtype": "bf16",
185
+ "sampler": null,
186
+ "tokenizer": {
187
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
188
+ "type": "PretrainedTokenizer"
189
+ },
190
+ "type": "FSDPTrainRunner",
191
+ "warmup_ratio": 0.03,
192
+ "weight_decay": 0.0
193
+ },
194
+ "train_dataloader": {
195
+ "dataset": {
196
+ "datasets": {
197
+ "action_key": "action",
198
+ "action_window_size": 10,
199
+ "data_root_path": "datasets/libero_goal_no_noops_lerobotv2.1",
200
+ "statistic_name": "libero_goal_no_noops",
201
+ "transforms": [
202
+ {
203
+ "embodiment_id": 2,
204
+ "name_mappings": {
205
+ "actions": [
206
+ "actions"
207
+ ],
208
+ "observation.state": [
209
+ "states"
210
+ ]
211
+ },
212
+ "parquet_keys": [
213
+ "observation.state",
214
+ "timestamp",
215
+ "actions",
216
+ "info",
217
+ "stats",
218
+ "action_masks"
219
+ ],
220
+ "type": "ProcessParquetInputs",
221
+ "video_keys": [
222
+ "observation.images.image",
223
+ "observation.images.wrist_image"
224
+ ]
225
+ },
226
+ {
227
+ "type": "ParquetPrompter"
228
+ },
229
+ {
230
+ "max_len": 600,
231
+ "num_images": 2,
232
+ "tokenizer": {
233
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
234
+ "type": "PretrainedTokenizer"
235
+ },
236
+ "type": "ProcessPromptsWithImage"
237
+ },
238
+ {
239
+ "height": 224,
240
+ "type": "ResizeImages",
241
+ "width": 224
242
+ },
243
+ {
244
+ "means": [
245
+ [
246
+ 123.515625,
247
+ 116.04492188,
248
+ 103.59375
249
+ ],
250
+ [
251
+ 123.515625,
252
+ 116.04492188,
253
+ 103.59375
254
+ ]
255
+ ],
256
+ "stds": [
257
+ [
258
+ 58.27148438,
259
+ 57.02636719,
260
+ 57.27539062
261
+ ],
262
+ [
263
+ 58.27148438,
264
+ 57.02636719,
265
+ 57.27539062
266
+ ]
267
+ ],
268
+ "type": "NormalizeImages"
269
+ },
270
+ {
271
+ "action_dim": 32,
272
+ "action_key": "action",
273
+ "norm_type": "mean_std",
274
+ "state_dim": 64,
275
+ "state_key": "proprio",
276
+ "type": "NormalizeStatesAndActions"
277
+ }
278
+ ],
279
+ "type": "ParquetDataset",
280
+ "use_delta": false,
281
+ "window_start_idx": 0
282
+ },
283
+ "name_mappings": {
284
+ "action": [
285
+ "action"
286
+ ],
287
+ "observation.state": [
288
+ "proprio"
289
+ ]
290
+ },
291
+ "statistic_keys": [
292
+ "observation.state",
293
+ "timestamp",
294
+ "action"
295
+ ],
296
+ "statistic_name": "libero_goal_no_noops",
297
+ "type": "DistributedRepeatingDataset"
298
+ },
299
+ "per_device_batch_size": 8,
300
+ "per_device_num_workers": 4
301
+ }
302
+ }
gr00t_eagle_3b_libero_goal_full_finetune_bs64/config.yaml ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ eval:
2
+ dataset:
3
+ transforms:
4
+ - embodiment_id: 2
5
+ img_keys:
6
+ - agentview_image
7
+ - robot0_eye_in_hand_image
8
+ type: ProcessLiberoEvalInputs
9
+ - image_resize_strategy: resize-naive
10
+ input_sizes:
11
+ - - 3
12
+ - 224
13
+ - 224
14
+ - - 3
15
+ - 224
16
+ - 224
17
+ means:
18
+ - - 123.515625
19
+ - 116.04492188
20
+ - 103.59375
21
+ - - 123.515625
22
+ - 116.04492188
23
+ - 103.59375
24
+ stds:
25
+ - - 58.27148438
26
+ - 57.02636719
27
+ - 57.27539062
28
+ - - 58.27148438
29
+ - 57.02636719
30
+ - 57.27539062
31
+ type: TransformImage
32
+ - max_len: 600
33
+ num_images: 2
34
+ tokenizer:
35
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
36
+ type: PretrainedTokenizer
37
+ type: ProcessPromptsWithImage
38
+ - gripper_key: robot0_gripper_qpos
39
+ norm_type: mean_std
40
+ out_key: states
41
+ pos_key: robot0_eef_pos
42
+ quat_key: robot0_eef_quat
43
+ state_dim: 64
44
+ type: LiberoProprioFromInputs
45
+ type: LiberoParquetEvalDataset
46
+ denormalize_action:
47
+ norm_type: mean_std
48
+ type: DenormalizeLiberoAction
49
+ eval_chunk_size: 10
50
+ model_family: pi0
51
+ num_steps_wait: 10
52
+ num_trials_per_task: 50
53
+ resize_size: 224
54
+ seed: 7
55
+ task_suite_name: libero_goal
56
+ type: LiberoEvalRunner
57
+ inference_model:
58
+ pretrained_name_or_path: ./checkpoints/GR00T-N1.5-3B
59
+ type: LlavaVLA
60
+ vla_head:
61
+ action_dim: 32
62
+ diffusion_model_cfg:
63
+ attention_head_dim: 48
64
+ cross_attention_dim: 2048
65
+ dropout: 0.2
66
+ final_dropout: true
67
+ interleave_self_attention: true
68
+ norm_type: ada_norm
69
+ num_attention_heads: 32
70
+ num_layers: 16
71
+ output_dim: 1024
72
+ positional_embeddings: null
73
+ hidden_size: 1024
74
+ input_embedding_dim: 1536
75
+ num_heads: 4
76
+ num_inference_timesteps: 4
77
+ num_layers: 1
78
+ ori_action_dim: 7
79
+ state_dim: 64
80
+ traj_length: 10
81
+ type: FlowMatchingInferenceHead
82
+ vlm_backbone:
83
+ type: EagleInferenceBackbone
84
+ vlm_path: fluxvla/models/third_party_models/eagle2_hg_model
85
+ model:
86
+ freeze_projector: false
87
+ freeze_vlm_backbone: false
88
+ name_mapping:
89
+ vla_head: action_head
90
+ vlm_backbone.vlm: backbone.eagle_model
91
+ pretrained_name_or_path: ./checkpoints/GR00T-N1.5-3B
92
+ type: LlavaVLA
93
+ vla_head:
94
+ action_dim: 32
95
+ hidden_size: 1024
96
+ input_embedding_dim: 1536
97
+ num_heads: 4
98
+ num_inference_timesteps: 4
99
+ num_layers: 1
100
+ ori_action_dim: 7
101
+ state_dim: 64
102
+ traj_length: 10
103
+ type: FlowMatchingHead
104
+ vlm_backbone:
105
+ type: EagleBackbone
106
+ vlm_path: fluxvla/models/third_party_models/eagle2_hg_model
107
+ runner:
108
+ change_key_name: false
109
+ collator:
110
+ keys:
111
+ - states
112
+ - observation.eepose
113
+ - timestamp
114
+ - images
115
+ - img_masks
116
+ - lang_tokens
117
+ - lang_masks
118
+ - actions
119
+ - action_masks
120
+ - embodiment_ids
121
+ meta_keys:
122
+ - task_description
123
+ - prompt
124
+ - info
125
+ - stats
126
+ type: DictCollator
127
+ enable_gradient_checkpointing: false
128
+ enable_mixed_precision_training: true
129
+ learning_rate: 1.5e-05
130
+ lr_scheduler_type: linear-warmup+cosine-decay
131
+ max_epochs: 18
132
+ max_grad_norm: 1.0
133
+ metric:
134
+ active_trackers:
135
+ - jsonl
136
+ - tensorboard
137
+ grad_accumulation_steps: 1
138
+ run_dir: work_dirs
139
+ type: VLAMetric
140
+ window_size: 1
141
+ mixed_precision_dtype: bf16
142
+ sampler: null
143
+ tokenizer:
144
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
145
+ type: PretrainedTokenizer
146
+ type: FSDPTrainRunner
147
+ warmup_ratio: 0.03
148
+ weight_decay: 0.0
149
+ train_dataloader:
150
+ dataset:
151
+ datasets:
152
+ action_key: action
153
+ action_window_size: 10
154
+ data_root_path: datasets/libero_goal_no_noops_lerobotv2.1
155
+ statistic_name: libero_goal_no_noops
156
+ transforms:
157
+ - embodiment_id: 2
158
+ name_mappings:
159
+ actions:
160
+ - actions
161
+ observation.state:
162
+ - states
163
+ parquet_keys:
164
+ - observation.state
165
+ - timestamp
166
+ - actions
167
+ - info
168
+ - stats
169
+ - action_masks
170
+ type: ProcessParquetInputs
171
+ video_keys:
172
+ - observation.images.image
173
+ - observation.images.wrist_image
174
+ - type: ParquetPrompter
175
+ - max_len: 600
176
+ num_images: 2
177
+ tokenizer:
178
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
179
+ type: PretrainedTokenizer
180
+ type: ProcessPromptsWithImage
181
+ - height: 224
182
+ type: ResizeImages
183
+ width: 224
184
+ - means:
185
+ - - 123.515625
186
+ - 116.04492188
187
+ - 103.59375
188
+ - - 123.515625
189
+ - 116.04492188
190
+ - 103.59375
191
+ stds:
192
+ - - 58.27148438
193
+ - 57.02636719
194
+ - 57.27539062
195
+ - - 58.27148438
196
+ - 57.02636719
197
+ - 57.27539062
198
+ type: NormalizeImages
199
+ - action_dim: 32
200
+ action_key: action
201
+ norm_type: mean_std
202
+ state_dim: 64
203
+ state_key: proprio
204
+ type: NormalizeStatesAndActions
205
+ type: ParquetDataset
206
+ use_delta: false
207
+ window_start_idx: 0
208
+ name_mappings:
209
+ action:
210
+ - action
211
+ observation.state:
212
+ - proprio
213
+ statistic_keys:
214
+ - observation.state
215
+ - timestamp
216
+ - action
217
+ statistic_name: libero_goal_no_noops
218
+ type: DistributedRepeatingDataset
219
+ per_device_batch_size: 8
220
+ per_device_num_workers: 4
gr00t_eagle_3b_libero_goal_full_finetune_bs64/dataset_statistics.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "libero_goal_no_noops": {
3
+ "proprio": {
4
+ "mean": [
5
+ -0.09891432758878499,
6
+ 0.01489584178884142,
7
+ 1.067322519907531,
8
+ 2.8289916028867363,
9
+ 0.31907902813946676,
10
+ -0.2782741362135122,
11
+ 0.02821884616217808,
12
+ -0.02719759491844234
13
+ ],
14
+ "std": [
15
+ 0.04140466877133655,
16
+ 0.04117920944028847,
17
+ 0.037538661473221754,
18
+ 0.1955033740750787,
19
+ 0.2574273555156476,
20
+ 0.12620857483432396,
21
+ 0.0053229405182870755,
22
+ 0.005316267734440966
23
+ ],
24
+ "min": [
25
+ -0.46141287684440613,
26
+ -0.30136311054229736,
27
+ 0.9083037972450256,
28
+ 1.002794623374939,
29
+ -1.0517308712005615,
30
+ -1.5227035284042358,
31
+ -0.0021671096328645945,
32
+ -0.042015016078948975
33
+ ],
34
+ "max": [
35
+ 0.13241106271743774,
36
+ 0.3271525800228119,
37
+ 1.472778081893921,
38
+ 3.4731650352478027,
39
+ 2.676265239715576,
40
+ 0.6698114275932312,
41
+ 0.04232141748070717,
42
+ 0.001021005678921938
43
+ ],
44
+ "q01": [
45
+ -0.22800911694063627,
46
+ -0.10299188974829282,
47
+ 0.9455820491176684,
48
+ 2.608259821258135,
49
+ -0.18011099436472794,
50
+ -0.5618953405895196,
51
+ 0.012391739034799726,
52
+ -0.039969403267763146
53
+ ],
54
+ "q99": [
55
+ -0.01061282617817482,
56
+ 0.11139527808191847,
57
+ 1.2117906450921032,
58
+ 3.168615021869246,
59
+ 0.6706572281431679,
60
+ 0.05441701961452796,
61
+ 0.04007683324960615,
62
+ -0.009863127064877238
63
+ ]
64
+ },
65
+ "timestamp": {
66
+ "mean": [
67
+ 3.354542962472823
68
+ ],
69
+ "std": [
70
+ 2.391036718656464
71
+ ],
72
+ "min": [
73
+ 0.0
74
+ ],
75
+ "max": [
76
+ 17.3
77
+ ],
78
+ "q01": null,
79
+ "q99": null
80
+ },
81
+ "action": {
82
+ "mean": [
83
+ 0.04244028081958392,
84
+ 0.03443110282231447,
85
+ -0.15229553502677498,
86
+ -0.0024877518145540465,
87
+ 0.02584054000286765,
88
+ 0.026984970605938637,
89
+ 0.6345212227794035
90
+ ],
91
+ "std": [
92
+ 0.15081003273695404,
93
+ 0.13262089326077886,
94
+ 0.18549323777289492,
95
+ 0.020653428159559374,
96
+ 0.029405301079120767,
97
+ 0.03768659701327122,
98
+ 0.1820141409830254
99
+ ],
100
+ "min": [
101
+ -0.9375,
102
+ -0.9375,
103
+ -0.9375,
104
+ -0.24214285612106323,
105
+ -0.375,
106
+ -0.2871428430080414,
107
+ 0.0
108
+ ],
109
+ "max": [
110
+ 0.9375,
111
+ 0.9375,
112
+ 0.9375,
113
+ 0.3557142913341522,
114
+ 0.375,
115
+ 0.375,
116
+ 1.0
117
+ ],
118
+ "q01": [
119
+ -0.6000398242585724,
120
+ -0.4840628973395442,
121
+ -0.828284557604454,
122
+ -0.08435403729704126,
123
+ -0.10102247173430137,
124
+ -0.08468755117369905,
125
+ 0.22234615748180359
126
+ ],
127
+ "q99": [
128
+ 0.7428644945783142,
129
+ 0.5763548859545187,
130
+ 0.6755035821278529,
131
+ 0.09240677347557615,
132
+ 0.17272708537297377,
133
+ 0.14638082286503087,
134
+ 1.0
135
+ ]
136
+ }
137
+ }
138
+ }
gr00t_eagle_3b_libero_goal_full_finetune_bs64/gr00t_eagle_3b_libero_goal_full_finetune_2026_05_14_02_40_40.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_goal_full_finetune_bs64/run-metrics.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hparams": "{'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_goal_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_goal_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': 
{'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_goal_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_goal', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_goal_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 
'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_goal_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_goal_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], 
[123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_goal_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_goal', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_goal_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_goal_full_finetune_2026_05_14_02_40_40'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_goal_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_goal_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_goal', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_goal', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 
'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'run_id': 'gr00t_eagle_3b_libero_goal_full_finetune_2026_05_14_02_40_40'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_goal_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 
'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_goal_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_goal_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 
'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_goal_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_goal', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_goal_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_goal_full_finetune_2026_05_14_02_40_40'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_goal_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_goal_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_goal', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_goal', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 
'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_goal_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_goal', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_goal', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 
'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}", "run_id": "gr00t_eagle_3b_libero_goal_full_finetune_2026_05_14_02_40_40"}
gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</box>": 151673,
3
+ "</img>": 151671,
4
+ "</interval>": 151679,
5
+ "</quad>": 151675,
6
+ "</ref>": 151677,
7
+ "</think>": 151668,
8
+ "</tool_call>": 151658,
9
+ "</tool_response>": 151666,
10
+ "<IMG_CONTEXT>": 151669,
11
+ "<box>": 151672,
12
+ "<img>": 151670,
13
+ "<interval>": 151678,
14
+ "<quad>": 151674,
15
+ "<ref>": 151676,
16
+ "<think>": 151667,
17
+ "<tool_call>": 151657,
18
+ "<tool_response>": 151665,
19
+ "<|box_end|>": 151649,
20
+ "<|box_start|>": 151648,
21
+ "<|endoftext|>": 151643,
22
+ "<|file_sep|>": 151664,
23
+ "<|fim_middle|>": 151660,
24
+ "<|fim_pad|>": 151662,
25
+ "<|fim_prefix|>": 151659,
26
+ "<|fim_suffix|>": 151661,
27
+ "<|im_end|>": 151645,
28
+ "<|im_start|>": 151644,
29
+ "<|image_pad|>": 151655,
30
+ "<|object_ref_end|>": 151647,
31
+ "<|object_ref_start|>": 151646,
32
+ "<|quad_end|>": 151651,
33
+ "<|quad_start|>": 151650,
34
+ "<|repo_name|>": 151663,
35
+ "<|video_pad|>": 151656,
36
+ "<|vision_end|>": 151653,
37
+ "<|vision_pad|>": 151654,
38
+ "<|vision_start|>": 151652
39
+ }
gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<IMG_CONTEXT>",
17
+ "<img>",
18
+ "</img>",
19
+ "<box>",
20
+ "</box>",
21
+ "<quad>",
22
+ "</quad>",
23
+ "<ref>",
24
+ "</ref>",
25
+ "<interval>",
26
+ "</interval>"
27
+ ],
28
+ "eos_token": {
29
+ "content": "<|im_end|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<IMG_CONTEXT>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<img>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</img>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<box>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</box>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<quad>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</quad>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "<ref>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "</ref>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<interval>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "151679": {
295
+ "content": "</interval>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ }
302
+ },
303
+ "additional_special_tokens": [
304
+ "<|im_start|>",
305
+ "<|im_end|>",
306
+ "<|object_ref_start|>",
307
+ "<|object_ref_end|>",
308
+ "<|box_start|>",
309
+ "<|box_end|>",
310
+ "<|quad_start|>",
311
+ "<|quad_end|>",
312
+ "<|vision_start|>",
313
+ "<|vision_end|>",
314
+ "<|vision_pad|>",
315
+ "<|image_pad|>",
316
+ "<|video_pad|>",
317
+ "<IMG_CONTEXT>",
318
+ "<img>",
319
+ "</img>",
320
+ "<box>",
321
+ "</box>",
322
+ "<quad>",
323
+ "</quad>",
324
+ "<ref>",
325
+ "</ref>",
326
+ "<interval>",
327
+ "</interval>"
328
+ ],
329
+ "auto_map": {
330
+ "AutoProcessor": "processing_eagle2_5_vl.Eagle2_5_VLProcessor"
331
+ },
332
+ "bos_token": null,
333
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
334
+ "clean_up_tokenization_spaces": false,
335
+ "eos_token": "<|im_end|>",
336
+ "errors": "replace",
337
+ "extra_special_tokens": {},
338
+ "model_max_length": 16384,
339
+ "pad_token": "<|endoftext|>",
340
+ "processor_class": "Eagle2_5_VLProcessor",
341
+ "split_special_tokens": false,
342
+ "tokenizer_class": "Qwen2Tokenizer",
343
+ "unk_token": null
344
+ }
gr00t_eagle_3b_libero_goal_full_finetune_bs64/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_goal_full_finetune_bs64/vlm_backbone_config.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation": "flash_attention_2",
3
+ "architectures": [
4
+ "Eagle2_5_VLForConditionalGeneration"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_eagle2_5_vl.Eagle2_5_VLConfig",
8
+ "AutoModel": "modeling_eagle2_5_vl.Eagle2_5_VLForConditionalGeneration"
9
+ },
10
+ "downsample_ratio": 0.5,
11
+ "dynamic_image_size": true,
12
+ "force_image_size": 224,
13
+ "image_token_index": 151669,
14
+ "initializer_range": 0.02,
15
+ "loss_version": "efficient_v2_cp_head",
16
+ "max_dynamic_tiles": 12,
17
+ "min_dynamic_tiles": 1,
18
+ "mlp_checkpoint": false,
19
+ "mlp_connector_layers": 1,
20
+ "model_type": "eagle_2_5_vl",
21
+ "output_attentions": false,
22
+ "pad2square": false,
23
+ "select_layer": -1,
24
+ "template": "qwen3-chat",
25
+ "text_config": {
26
+ "_name_or_path": "Qwen/Qwen3-1.7B",
27
+ "architectures": [
28
+ "Qwen3ForCausalLM"
29
+ ],
30
+ "attention_bias": false,
31
+ "attention_dropout": 0,
32
+ "bos_token_id": 151643,
33
+ "eos_token_id": 151645,
34
+ "head_dim": 128,
35
+ "hidden_act": "silu",
36
+ "hidden_size": 2048,
37
+ "initializer_range": 0.02,
38
+ "intermediate_size": 6144,
39
+ "layer_types": [
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention"
68
+ ],
69
+ "max_position_embeddings": 40960,
70
+ "max_window_layers": 28,
71
+ "model_type": "qwen3",
72
+ "num_attention_heads": 16,
73
+ "num_hidden_layers": 12,
74
+ "num_key_value_heads": 8,
75
+ "rms_norm_eps": 1e-06,
76
+ "rope_scaling": null,
77
+ "rope_theta": 1000000,
78
+ "sliding_window": null,
79
+ "tie_word_embeddings": true,
80
+ "torch_dtype": "bfloat16",
81
+ "use_cache": false,
82
+ "use_sliding_window": false,
83
+ "vocab_size": 151680
84
+ },
85
+ "tie_word_embeddings": true,
86
+ "torch_dtype": "bfloat16",
87
+ "transformers_version": null,
88
+ "use_backbone_lora": 0,
89
+ "use_llm_lora": 0,
90
+ "use_pixel_shuffle": false,
91
+ "use_thumbnail": true,
92
+ "vision_config": {
93
+ "attention_dropout": 0,
94
+ "hidden_act": "gelu_pytorch_tanh",
95
+ "hidden_size": 1152,
96
+ "image_size": 224,
97
+ "intermediate_size": 4304,
98
+ "layer_norm_eps": 1e-06,
99
+ "model_type": "siglip_vision_model",
100
+ "num_attention_heads": 16,
101
+ "num_channels": 3,
102
+ "num_hidden_layers": 27,
103
+ "patch_size": 14,
104
+ "torch_dtype": "bfloat16"
105
+ }
106
+ }
gr00t_eagle_3b_libero_object_full_finetune_bs64/checkpoints/step-018846-epoch-18-loss=0.0701.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e63051195907f4e25fc0a6dc317a1306643faaa4cbfd8677c8d4ecf3f627b8c6
3
+ size 10896783888
gr00t_eagle_3b_libero_object_full_finetune_bs64/config.json ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval": {
3
+ "dataset": {
4
+ "transforms": [
5
+ {
6
+ "embodiment_id": 2,
7
+ "img_keys": [
8
+ "agentview_image",
9
+ "robot0_eye_in_hand_image"
10
+ ],
11
+ "type": "ProcessLiberoEvalInputs"
12
+ },
13
+ {
14
+ "image_resize_strategy": "resize-naive",
15
+ "input_sizes": [
16
+ [
17
+ 3,
18
+ 224,
19
+ 224
20
+ ],
21
+ [
22
+ 3,
23
+ 224,
24
+ 224
25
+ ]
26
+ ],
27
+ "means": [
28
+ [
29
+ 123.515625,
30
+ 116.04492188,
31
+ 103.59375
32
+ ],
33
+ [
34
+ 123.515625,
35
+ 116.04492188,
36
+ 103.59375
37
+ ]
38
+ ],
39
+ "stds": [
40
+ [
41
+ 58.27148438,
42
+ 57.02636719,
43
+ 57.27539062
44
+ ],
45
+ [
46
+ 58.27148438,
47
+ 57.02636719,
48
+ 57.27539062
49
+ ]
50
+ ],
51
+ "type": "TransformImage"
52
+ },
53
+ {
54
+ "max_len": 600,
55
+ "num_images": 2,
56
+ "tokenizer": {
57
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
58
+ "type": "PretrainedTokenizer"
59
+ },
60
+ "type": "ProcessPromptsWithImage"
61
+ },
62
+ {
63
+ "gripper_key": "robot0_gripper_qpos",
64
+ "norm_type": "mean_std",
65
+ "out_key": "states",
66
+ "pos_key": "robot0_eef_pos",
67
+ "quat_key": "robot0_eef_quat",
68
+ "state_dim": 64,
69
+ "type": "LiberoProprioFromInputs"
70
+ }
71
+ ],
72
+ "type": "LiberoParquetEvalDataset"
73
+ },
74
+ "denormalize_action": {
75
+ "norm_type": "mean_std",
76
+ "type": "DenormalizeLiberoAction"
77
+ },
78
+ "eval_chunk_size": 10,
79
+ "model_family": "pi0",
80
+ "num_steps_wait": 10,
81
+ "num_trials_per_task": 50,
82
+ "resize_size": 224,
83
+ "seed": 7,
84
+ "task_suite_name": "libero_object",
85
+ "type": "LiberoEvalRunner"
86
+ },
87
+ "inference_model": {
88
+ "pretrained_name_or_path": "./checkpoints/GR00T-N1.5-3B",
89
+ "type": "LlavaVLA",
90
+ "vla_head": {
91
+ "action_dim": 32,
92
+ "diffusion_model_cfg": {
93
+ "attention_head_dim": 48,
94
+ "cross_attention_dim": 2048,
95
+ "dropout": 0.2,
96
+ "final_dropout": true,
97
+ "interleave_self_attention": true,
98
+ "norm_type": "ada_norm",
99
+ "num_attention_heads": 32,
100
+ "num_layers": 16,
101
+ "output_dim": 1024,
102
+ "positional_embeddings": null
103
+ },
104
+ "hidden_size": 1024,
105
+ "input_embedding_dim": 1536,
106
+ "num_heads": 4,
107
+ "num_inference_timesteps": 4,
108
+ "num_layers": 1,
109
+ "ori_action_dim": 7,
110
+ "state_dim": 64,
111
+ "traj_length": 10,
112
+ "type": "FlowMatchingInferenceHead"
113
+ },
114
+ "vlm_backbone": {
115
+ "type": "EagleInferenceBackbone",
116
+ "vlm_path": "fluxvla/models/third_party_models/eagle2_hg_model"
117
+ }
118
+ },
119
+ "model": {
120
+ "freeze_projector": false,
121
+ "freeze_vlm_backbone": false,
122
+ "name_mapping": {
123
+ "vla_head": "action_head",
124
+ "vlm_backbone.vlm": "backbone.eagle_model"
125
+ },
126
+ "pretrained_name_or_path": "./checkpoints/GR00T-N1.5-3B",
127
+ "type": "LlavaVLA",
128
+ "vla_head": {
129
+ "action_dim": 32,
130
+ "hidden_size": 1024,
131
+ "input_embedding_dim": 1536,
132
+ "num_heads": 4,
133
+ "num_inference_timesteps": 4,
134
+ "num_layers": 1,
135
+ "ori_action_dim": 7,
136
+ "state_dim": 64,
137
+ "traj_length": 10,
138
+ "type": "FlowMatchingHead"
139
+ },
140
+ "vlm_backbone": {
141
+ "type": "EagleBackbone",
142
+ "vlm_path": "fluxvla/models/third_party_models/eagle2_hg_model"
143
+ }
144
+ },
145
+ "runner": {
146
+ "change_key_name": false,
147
+ "collator": {
148
+ "keys": [
149
+ "states",
150
+ "observation.eepose",
151
+ "timestamp",
152
+ "images",
153
+ "img_masks",
154
+ "lang_tokens",
155
+ "lang_masks",
156
+ "actions",
157
+ "action_masks",
158
+ "embodiment_ids"
159
+ ],
160
+ "meta_keys": [
161
+ "task_description",
162
+ "prompt",
163
+ "info",
164
+ "stats"
165
+ ],
166
+ "type": "DictCollator"
167
+ },
168
+ "enable_gradient_checkpointing": false,
169
+ "enable_mixed_precision_training": true,
170
+ "learning_rate": 1.5e-05,
171
+ "lr_scheduler_type": "linear-warmup+cosine-decay",
172
+ "max_epochs": 18,
173
+ "max_grad_norm": 1.0,
174
+ "metric": {
175
+ "active_trackers": [
176
+ "jsonl",
177
+ "tensorboard"
178
+ ],
179
+ "grad_accumulation_steps": 1,
180
+ "run_dir": "work_dirs",
181
+ "type": "VLAMetric",
182
+ "window_size": 1
183
+ },
184
+ "mixed_precision_dtype": "bf16",
185
+ "sampler": null,
186
+ "tokenizer": {
187
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
188
+ "type": "PretrainedTokenizer"
189
+ },
190
+ "type": "FSDPTrainRunner",
191
+ "warmup_ratio": 0.03,
192
+ "weight_decay": 0.0
193
+ },
194
+ "train_dataloader": {
195
+ "dataset": {
196
+ "datasets": {
197
+ "action_key": "action",
198
+ "action_window_size": 10,
199
+ "data_root_path": "datasets/libero_object_no_noops_lerobotv2.1",
200
+ "statistic_name": "libero_object_no_noops",
201
+ "transforms": [
202
+ {
203
+ "embodiment_id": 2,
204
+ "name_mappings": {
205
+ "actions": [
206
+ "actions"
207
+ ],
208
+ "observation.state": [
209
+ "states"
210
+ ]
211
+ },
212
+ "parquet_keys": [
213
+ "observation.state",
214
+ "timestamp",
215
+ "actions",
216
+ "info",
217
+ "stats",
218
+ "action_masks"
219
+ ],
220
+ "type": "ProcessParquetInputs",
221
+ "video_keys": [
222
+ "observation.images.image",
223
+ "observation.images.wrist_image"
224
+ ]
225
+ },
226
+ {
227
+ "type": "ParquetPrompter"
228
+ },
229
+ {
230
+ "max_len": 600,
231
+ "num_images": 2,
232
+ "tokenizer": {
233
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
234
+ "type": "PretrainedTokenizer"
235
+ },
236
+ "type": "ProcessPromptsWithImage"
237
+ },
238
+ {
239
+ "height": 224,
240
+ "type": "ResizeImages",
241
+ "width": 224
242
+ },
243
+ {
244
+ "means": [
245
+ [
246
+ 123.515625,
247
+ 116.04492188,
248
+ 103.59375
249
+ ],
250
+ [
251
+ 123.515625,
252
+ 116.04492188,
253
+ 103.59375
254
+ ]
255
+ ],
256
+ "stds": [
257
+ [
258
+ 58.27148438,
259
+ 57.02636719,
260
+ 57.27539062
261
+ ],
262
+ [
263
+ 58.27148438,
264
+ 57.02636719,
265
+ 57.27539062
266
+ ]
267
+ ],
268
+ "type": "NormalizeImages"
269
+ },
270
+ {
271
+ "action_dim": 32,
272
+ "action_key": "action",
273
+ "norm_type": "mean_std",
274
+ "state_dim": 64,
275
+ "state_key": "proprio",
276
+ "type": "NormalizeStatesAndActions"
277
+ }
278
+ ],
279
+ "type": "ParquetDataset",
280
+ "use_delta": false,
281
+ "window_start_idx": 0
282
+ },
283
+ "name_mappings": {
284
+ "action": [
285
+ "action"
286
+ ],
287
+ "observation.state": [
288
+ "proprio"
289
+ ]
290
+ },
291
+ "statistic_keys": [
292
+ "observation.state",
293
+ "timestamp",
294
+ "action"
295
+ ],
296
+ "statistic_name": "libero_object_no_noops",
297
+ "type": "DistributedRepeatingDataset"
298
+ },
299
+ "per_device_batch_size": 8,
300
+ "per_device_num_workers": 4
301
+ }
302
+ }
gr00t_eagle_3b_libero_object_full_finetune_bs64/config.yaml ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ eval:
2
+ dataset:
3
+ transforms:
4
+ - embodiment_id: 2
5
+ img_keys:
6
+ - agentview_image
7
+ - robot0_eye_in_hand_image
8
+ type: ProcessLiberoEvalInputs
9
+ - image_resize_strategy: resize-naive
10
+ input_sizes:
11
+ - - 3
12
+ - 224
13
+ - 224
14
+ - - 3
15
+ - 224
16
+ - 224
17
+ means:
18
+ - - 123.515625
19
+ - 116.04492188
20
+ - 103.59375
21
+ - - 123.515625
22
+ - 116.04492188
23
+ - 103.59375
24
+ stds:
25
+ - - 58.27148438
26
+ - 57.02636719
27
+ - 57.27539062
28
+ - - 58.27148438
29
+ - 57.02636719
30
+ - 57.27539062
31
+ type: TransformImage
32
+ - max_len: 600
33
+ num_images: 2
34
+ tokenizer:
35
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
36
+ type: PretrainedTokenizer
37
+ type: ProcessPromptsWithImage
38
+ - gripper_key: robot0_gripper_qpos
39
+ norm_type: mean_std
40
+ out_key: states
41
+ pos_key: robot0_eef_pos
42
+ quat_key: robot0_eef_quat
43
+ state_dim: 64
44
+ type: LiberoProprioFromInputs
45
+ type: LiberoParquetEvalDataset
46
+ denormalize_action:
47
+ norm_type: mean_std
48
+ type: DenormalizeLiberoAction
49
+ eval_chunk_size: 10
50
+ model_family: pi0
51
+ num_steps_wait: 10
52
+ num_trials_per_task: 50
53
+ resize_size: 224
54
+ seed: 7
55
+ task_suite_name: libero_object
56
+ type: LiberoEvalRunner
57
+ inference_model:
58
+ pretrained_name_or_path: ./checkpoints/GR00T-N1.5-3B
59
+ type: LlavaVLA
60
+ vla_head:
61
+ action_dim: 32
62
+ diffusion_model_cfg:
63
+ attention_head_dim: 48
64
+ cross_attention_dim: 2048
65
+ dropout: 0.2
66
+ final_dropout: true
67
+ interleave_self_attention: true
68
+ norm_type: ada_norm
69
+ num_attention_heads: 32
70
+ num_layers: 16
71
+ output_dim: 1024
72
+ positional_embeddings: null
73
+ hidden_size: 1024
74
+ input_embedding_dim: 1536
75
+ num_heads: 4
76
+ num_inference_timesteps: 4
77
+ num_layers: 1
78
+ ori_action_dim: 7
79
+ state_dim: 64
80
+ traj_length: 10
81
+ type: FlowMatchingInferenceHead
82
+ vlm_backbone:
83
+ type: EagleInferenceBackbone
84
+ vlm_path: fluxvla/models/third_party_models/eagle2_hg_model
85
+ model:
86
+ freeze_projector: false
87
+ freeze_vlm_backbone: false
88
+ name_mapping:
89
+ vla_head: action_head
90
+ vlm_backbone.vlm: backbone.eagle_model
91
+ pretrained_name_or_path: ./checkpoints/GR00T-N1.5-3B
92
+ type: LlavaVLA
93
+ vla_head:
94
+ action_dim: 32
95
+ hidden_size: 1024
96
+ input_embedding_dim: 1536
97
+ num_heads: 4
98
+ num_inference_timesteps: 4
99
+ num_layers: 1
100
+ ori_action_dim: 7
101
+ state_dim: 64
102
+ traj_length: 10
103
+ type: FlowMatchingHead
104
+ vlm_backbone:
105
+ type: EagleBackbone
106
+ vlm_path: fluxvla/models/third_party_models/eagle2_hg_model
107
+ runner:
108
+ change_key_name: false
109
+ collator:
110
+ keys:
111
+ - states
112
+ - observation.eepose
113
+ - timestamp
114
+ - images
115
+ - img_masks
116
+ - lang_tokens
117
+ - lang_masks
118
+ - actions
119
+ - action_masks
120
+ - embodiment_ids
121
+ meta_keys:
122
+ - task_description
123
+ - prompt
124
+ - info
125
+ - stats
126
+ type: DictCollator
127
+ enable_gradient_checkpointing: false
128
+ enable_mixed_precision_training: true
129
+ learning_rate: 1.5e-05
130
+ lr_scheduler_type: linear-warmup+cosine-decay
131
+ max_epochs: 18
132
+ max_grad_norm: 1.0
133
+ metric:
134
+ active_trackers:
135
+ - jsonl
136
+ - tensorboard
137
+ grad_accumulation_steps: 1
138
+ run_dir: work_dirs
139
+ type: VLAMetric
140
+ window_size: 1
141
+ mixed_precision_dtype: bf16
142
+ sampler: null
143
+ tokenizer:
144
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
145
+ type: PretrainedTokenizer
146
+ type: FSDPTrainRunner
147
+ warmup_ratio: 0.03
148
+ weight_decay: 0.0
149
+ train_dataloader:
150
+ dataset:
151
+ datasets:
152
+ action_key: action
153
+ action_window_size: 10
154
+ data_root_path: datasets/libero_object_no_noops_lerobotv2.1
155
+ statistic_name: libero_object_no_noops
156
+ transforms:
157
+ - embodiment_id: 2
158
+ name_mappings:
159
+ actions:
160
+ - actions
161
+ observation.state:
162
+ - states
163
+ parquet_keys:
164
+ - observation.state
165
+ - timestamp
166
+ - actions
167
+ - info
168
+ - stats
169
+ - action_masks
170
+ type: ProcessParquetInputs
171
+ video_keys:
172
+ - observation.images.image
173
+ - observation.images.wrist_image
174
+ - type: ParquetPrompter
175
+ - max_len: 600
176
+ num_images: 2
177
+ tokenizer:
178
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
179
+ type: PretrainedTokenizer
180
+ type: ProcessPromptsWithImage
181
+ - height: 224
182
+ type: ResizeImages
183
+ width: 224
184
+ - means:
185
+ - - 123.515625
186
+ - 116.04492188
187
+ - 103.59375
188
+ - - 123.515625
189
+ - 116.04492188
190
+ - 103.59375
191
+ stds:
192
+ - - 58.27148438
193
+ - 57.02636719
194
+ - 57.27539062
195
+ - - 58.27148438
196
+ - 57.02636719
197
+ - 57.27539062
198
+ type: NormalizeImages
199
+ - action_dim: 32
200
+ action_key: action
201
+ norm_type: mean_std
202
+ state_dim: 64
203
+ state_key: proprio
204
+ type: NormalizeStatesAndActions
205
+ type: ParquetDataset
206
+ use_delta: false
207
+ window_start_idx: 0
208
+ name_mappings:
209
+ action:
210
+ - action
211
+ observation.state:
212
+ - proprio
213
+ statistic_keys:
214
+ - observation.state
215
+ - timestamp
216
+ - action
217
+ statistic_name: libero_object_no_noops
218
+ type: DistributedRepeatingDataset
219
+ per_device_batch_size: 8
220
+ per_device_num_workers: 4
gr00t_eagle_3b_libero_object_full_finetune_bs64/dataset_statistics.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "libero_object_no_noops": {
3
+ "proprio": {
4
+ "mean": [
5
+ -0.029990377887890714,
6
+ -0.007947119348036638,
7
+ 0.20293400450543442,
8
+ 3.108609864126749,
9
+ -0.2140478258736818,
10
+ -0.11307033080181891,
11
+ 0.02938040086729137,
12
+ -0.03055662046031239
13
+ ],
14
+ "std": [
15
+ 0.023670072817660013,
16
+ 0.06225550550101929,
17
+ 0.027602195887468282,
18
+ 0.030705662709939595,
19
+ 0.11858388544011475,
20
+ 0.0732862116780689,
21
+ 0.0033820150919409114,
22
+ 0.003251806898346789
23
+ ],
24
+ "min": [
25
+ -0.1765444278717041,
26
+ -0.29457300901412964,
27
+ 0.008128180168569088,
28
+ 2.2890501022338867,
29
+ -1.883241891860962,
30
+ -1.0600427389144897,
31
+ 0.0006495157140307128,
32
+ -0.041782498359680176
33
+ ],
34
+ "max": [
35
+ 0.14580604434013367,
36
+ 0.33216384053230286,
37
+ 0.3857804834842682,
38
+ 3.4003844261169434,
39
+ 0.7954911589622498,
40
+ 0.6642207503318787,
41
+ 0.04104341194033623,
42
+ -0.00018117300351150334
43
+ ],
44
+ "q01": null,
45
+ "q99": null
46
+ },
47
+ "timestamp": {
48
+ "mean": [
49
+ 3.721695479517497
50
+ ],
51
+ "std": [
52
+ 2.237081841546431
53
+ ],
54
+ "min": [
55
+ 0.0
56
+ ],
57
+ "max": [
58
+ 12.65
59
+ ],
60
+ "q01": null,
61
+ "q99": null
62
+ },
63
+ "action": {
64
+ "mean": [
65
+ 0.07096490746267721,
66
+ 0.13498889685796536,
67
+ -0.046013733641776924,
68
+ 0.0012352044345171392,
69
+ 0.006998803721298765,
70
+ -0.015027527802288103,
71
+ 0.46428998075465666
72
+ ],
73
+ "std": [
74
+ 0.10133946158044306,
75
+ 0.165716399861371,
76
+ 0.16914353294024564,
77
+ 0.009240558533809633,
78
+ 0.018657116474914717,
79
+ 0.015913625946349673,
80
+ 0.18849963395480163
81
+ ],
82
+ "min": [
83
+ -0.8839285969734192,
84
+ -0.9375,
85
+ -0.9375,
86
+ -0.15000000596046448,
87
+ -0.29035714268684387,
88
+ -0.32892856001853943,
89
+ 0.0
90
+ ],
91
+ "max": [
92
+ 0.9375,
93
+ 0.8919642567634583,
94
+ 0.9375,
95
+ 0.17678570747375488,
96
+ 0.35035714507102966,
97
+ 0.1810714304447174,
98
+ 1.0
99
+ ],
100
+ "q01": null,
101
+ "q99": null
102
+ }
103
+ }
104
+ }
gr00t_eagle_3b_libero_object_full_finetune_bs64/gr00t_eagle_3b_libero_object_full_finetune_2026_05_14_02_40_05.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_object_full_finetune_bs64/run-metrics.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hparams": "{'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_object_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_object_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': 
{'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_object_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_object', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_object_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 
'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_object_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_object_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], 
[123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_object_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_object', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_object_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_object_full_finetune_2026_05_14_02_40_05'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_object_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_object_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_object', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_object', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 
'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'run_id': 'gr00t_eagle_3b_libero_object_full_finetune_2026_05_14_02_40_05'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_object_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 
'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_object_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_object_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 
'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_object_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_object', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_object_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_object_full_finetune_2026_05_14_02_40_05'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_object_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_object_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_object', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_object', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 
'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_object_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_object', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_object', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 
'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}", "run_id": "gr00t_eagle_3b_libero_object_full_finetune_2026_05_14_02_40_05"}
gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</box>": 151673,
3
+ "</img>": 151671,
4
+ "</interval>": 151679,
5
+ "</quad>": 151675,
6
+ "</ref>": 151677,
7
+ "</think>": 151668,
8
+ "</tool_call>": 151658,
9
+ "</tool_response>": 151666,
10
+ "<IMG_CONTEXT>": 151669,
11
+ "<box>": 151672,
12
+ "<img>": 151670,
13
+ "<interval>": 151678,
14
+ "<quad>": 151674,
15
+ "<ref>": 151676,
16
+ "<think>": 151667,
17
+ "<tool_call>": 151657,
18
+ "<tool_response>": 151665,
19
+ "<|box_end|>": 151649,
20
+ "<|box_start|>": 151648,
21
+ "<|endoftext|>": 151643,
22
+ "<|file_sep|>": 151664,
23
+ "<|fim_middle|>": 151660,
24
+ "<|fim_pad|>": 151662,
25
+ "<|fim_prefix|>": 151659,
26
+ "<|fim_suffix|>": 151661,
27
+ "<|im_end|>": 151645,
28
+ "<|im_start|>": 151644,
29
+ "<|image_pad|>": 151655,
30
+ "<|object_ref_end|>": 151647,
31
+ "<|object_ref_start|>": 151646,
32
+ "<|quad_end|>": 151651,
33
+ "<|quad_start|>": 151650,
34
+ "<|repo_name|>": 151663,
35
+ "<|video_pad|>": 151656,
36
+ "<|vision_end|>": 151653,
37
+ "<|vision_pad|>": 151654,
38
+ "<|vision_start|>": 151652
39
+ }
gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<IMG_CONTEXT>",
17
+ "<img>",
18
+ "</img>",
19
+ "<box>",
20
+ "</box>",
21
+ "<quad>",
22
+ "</quad>",
23
+ "<ref>",
24
+ "</ref>",
25
+ "<interval>",
26
+ "</interval>"
27
+ ],
28
+ "eos_token": {
29
+ "content": "<|im_end|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<IMG_CONTEXT>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<img>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</img>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<box>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</box>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<quad>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</quad>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "<ref>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "</ref>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<interval>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "151679": {
295
+ "content": "</interval>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ }
302
+ },
303
+ "additional_special_tokens": [
304
+ "<|im_start|>",
305
+ "<|im_end|>",
306
+ "<|object_ref_start|>",
307
+ "<|object_ref_end|>",
308
+ "<|box_start|>",
309
+ "<|box_end|>",
310
+ "<|quad_start|>",
311
+ "<|quad_end|>",
312
+ "<|vision_start|>",
313
+ "<|vision_end|>",
314
+ "<|vision_pad|>",
315
+ "<|image_pad|>",
316
+ "<|video_pad|>",
317
+ "<IMG_CONTEXT>",
318
+ "<img>",
319
+ "</img>",
320
+ "<box>",
321
+ "</box>",
322
+ "<quad>",
323
+ "</quad>",
324
+ "<ref>",
325
+ "</ref>",
326
+ "<interval>",
327
+ "</interval>"
328
+ ],
329
+ "auto_map": {
330
+ "AutoProcessor": "processing_eagle2_5_vl.Eagle2_5_VLProcessor"
331
+ },
332
+ "bos_token": null,
333
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
334
+ "clean_up_tokenization_spaces": false,
335
+ "eos_token": "<|im_end|>",
336
+ "errors": "replace",
337
+ "extra_special_tokens": {},
338
+ "model_max_length": 16384,
339
+ "pad_token": "<|endoftext|>",
340
+ "processor_class": "Eagle2_5_VLProcessor",
341
+ "split_special_tokens": false,
342
+ "tokenizer_class": "Qwen2Tokenizer",
343
+ "unk_token": null
344
+ }
gr00t_eagle_3b_libero_object_full_finetune_bs64/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_object_full_finetune_bs64/vlm_backbone_config.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation": "flash_attention_2",
3
+ "architectures": [
4
+ "Eagle2_5_VLForConditionalGeneration"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_eagle2_5_vl.Eagle2_5_VLConfig",
8
+ "AutoModel": "modeling_eagle2_5_vl.Eagle2_5_VLForConditionalGeneration"
9
+ },
10
+ "downsample_ratio": 0.5,
11
+ "dynamic_image_size": true,
12
+ "force_image_size": 224,
13
+ "image_token_index": 151669,
14
+ "initializer_range": 0.02,
15
+ "loss_version": "efficient_v2_cp_head",
16
+ "max_dynamic_tiles": 12,
17
+ "min_dynamic_tiles": 1,
18
+ "mlp_checkpoint": false,
19
+ "mlp_connector_layers": 1,
20
+ "model_type": "eagle_2_5_vl",
21
+ "output_attentions": false,
22
+ "pad2square": false,
23
+ "select_layer": -1,
24
+ "template": "qwen3-chat",
25
+ "text_config": {
26
+ "_name_or_path": "Qwen/Qwen3-1.7B",
27
+ "architectures": [
28
+ "Qwen3ForCausalLM"
29
+ ],
30
+ "attention_bias": false,
31
+ "attention_dropout": 0,
32
+ "bos_token_id": 151643,
33
+ "eos_token_id": 151645,
34
+ "head_dim": 128,
35
+ "hidden_act": "silu",
36
+ "hidden_size": 2048,
37
+ "initializer_range": 0.02,
38
+ "intermediate_size": 6144,
39
+ "layer_types": [
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention"
68
+ ],
69
+ "max_position_embeddings": 40960,
70
+ "max_window_layers": 28,
71
+ "model_type": "qwen3",
72
+ "num_attention_heads": 16,
73
+ "num_hidden_layers": 12,
74
+ "num_key_value_heads": 8,
75
+ "rms_norm_eps": 1e-06,
76
+ "rope_scaling": null,
77
+ "rope_theta": 1000000,
78
+ "sliding_window": null,
79
+ "tie_word_embeddings": true,
80
+ "torch_dtype": "bfloat16",
81
+ "use_cache": false,
82
+ "use_sliding_window": false,
83
+ "vocab_size": 151680
84
+ },
85
+ "tie_word_embeddings": true,
86
+ "torch_dtype": "bfloat16",
87
+ "transformers_version": null,
88
+ "use_backbone_lora": 0,
89
+ "use_llm_lora": 0,
90
+ "use_pixel_shuffle": false,
91
+ "use_thumbnail": true,
92
+ "vision_config": {
93
+ "attention_dropout": 0,
94
+ "hidden_act": "gelu_pytorch_tanh",
95
+ "hidden_size": 1152,
96
+ "image_size": 224,
97
+ "intermediate_size": 4304,
98
+ "layer_norm_eps": 1e-06,
99
+ "model_type": "siglip_vision_model",
100
+ "num_attention_heads": 16,
101
+ "num_channels": 3,
102
+ "num_hidden_layers": 27,
103
+ "patch_size": 14,
104
+ "torch_dtype": "bfloat16"
105
+ }
106
+ }
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/checkpoints/step-014904-epoch-18-loss=0.0780.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a624cec26c0a9d2179f84c668cd2a4eb34f9f5b34e78b67c4d7c5a93676671e
3
+ size 10896783888
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/config.json ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval": {
3
+ "dataset": {
4
+ "transforms": [
5
+ {
6
+ "embodiment_id": 2,
7
+ "img_keys": [
8
+ "agentview_image",
9
+ "robot0_eye_in_hand_image"
10
+ ],
11
+ "type": "ProcessLiberoEvalInputs"
12
+ },
13
+ {
14
+ "image_resize_strategy": "resize-naive",
15
+ "input_sizes": [
16
+ [
17
+ 3,
18
+ 224,
19
+ 224
20
+ ],
21
+ [
22
+ 3,
23
+ 224,
24
+ 224
25
+ ]
26
+ ],
27
+ "means": [
28
+ [
29
+ 123.515625,
30
+ 116.04492188,
31
+ 103.59375
32
+ ],
33
+ [
34
+ 123.515625,
35
+ 116.04492188,
36
+ 103.59375
37
+ ]
38
+ ],
39
+ "stds": [
40
+ [
41
+ 58.27148438,
42
+ 57.02636719,
43
+ 57.27539062
44
+ ],
45
+ [
46
+ 58.27148438,
47
+ 57.02636719,
48
+ 57.27539062
49
+ ]
50
+ ],
51
+ "type": "TransformImage"
52
+ },
53
+ {
54
+ "max_len": 600,
55
+ "num_images": 2,
56
+ "tokenizer": {
57
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
58
+ "type": "PretrainedTokenizer"
59
+ },
60
+ "type": "ProcessPromptsWithImage"
61
+ },
62
+ {
63
+ "gripper_key": "robot0_gripper_qpos",
64
+ "norm_type": "mean_std",
65
+ "out_key": "states",
66
+ "pos_key": "robot0_eef_pos",
67
+ "quat_key": "robot0_eef_quat",
68
+ "state_dim": 64,
69
+ "type": "LiberoProprioFromInputs"
70
+ }
71
+ ],
72
+ "type": "LiberoParquetEvalDataset"
73
+ },
74
+ "denormalize_action": {
75
+ "norm_type": "mean_std",
76
+ "type": "DenormalizeLiberoAction"
77
+ },
78
+ "eval_chunk_size": 10,
79
+ "model_family": "pi0",
80
+ "num_steps_wait": 10,
81
+ "num_trials_per_task": 50,
82
+ "resize_size": 224,
83
+ "seed": 7,
84
+ "task_suite_name": "libero_spatial",
85
+ "type": "LiberoEvalRunner"
86
+ },
87
+ "inference_model": {
88
+ "pretrained_name_or_path": "./checkpoints/GR00T-N1.5-3B",
89
+ "type": "LlavaVLA",
90
+ "vla_head": {
91
+ "action_dim": 32,
92
+ "diffusion_model_cfg": {
93
+ "attention_head_dim": 48,
94
+ "cross_attention_dim": 2048,
95
+ "dropout": 0.2,
96
+ "final_dropout": true,
97
+ "interleave_self_attention": true,
98
+ "norm_type": "ada_norm",
99
+ "num_attention_heads": 32,
100
+ "num_layers": 16,
101
+ "output_dim": 1024,
102
+ "positional_embeddings": null
103
+ },
104
+ "hidden_size": 1024,
105
+ "input_embedding_dim": 1536,
106
+ "num_heads": 4,
107
+ "num_inference_timesteps": 4,
108
+ "num_layers": 1,
109
+ "ori_action_dim": 7,
110
+ "state_dim": 64,
111
+ "traj_length": 10,
112
+ "type": "FlowMatchingInferenceHead"
113
+ },
114
+ "vlm_backbone": {
115
+ "type": "EagleInferenceBackbone",
116
+ "vlm_path": "fluxvla/models/third_party_models/eagle2_hg_model"
117
+ }
118
+ },
119
+ "model": {
120
+ "freeze_projector": false,
121
+ "freeze_vlm_backbone": false,
122
+ "name_mapping": {
123
+ "vla_head": "action_head",
124
+ "vlm_backbone.vlm": "backbone.eagle_model"
125
+ },
126
+ "pretrained_name_or_path": "./checkpoints/GR00T-N1.5-3B",
127
+ "type": "LlavaVLA",
128
+ "vla_head": {
129
+ "action_dim": 32,
130
+ "hidden_size": 1024,
131
+ "input_embedding_dim": 1536,
132
+ "num_heads": 4,
133
+ "num_inference_timesteps": 4,
134
+ "num_layers": 1,
135
+ "ori_action_dim": 7,
136
+ "state_dim": 64,
137
+ "traj_length": 10,
138
+ "type": "FlowMatchingHead"
139
+ },
140
+ "vlm_backbone": {
141
+ "type": "EagleBackbone",
142
+ "vlm_path": "fluxvla/models/third_party_models/eagle2_hg_model"
143
+ }
144
+ },
145
+ "runner": {
146
+ "change_key_name": false,
147
+ "collator": {
148
+ "keys": [
149
+ "states",
150
+ "observation.eepose",
151
+ "timestamp",
152
+ "images",
153
+ "img_masks",
154
+ "lang_tokens",
155
+ "lang_masks",
156
+ "actions",
157
+ "action_masks",
158
+ "embodiment_ids"
159
+ ],
160
+ "meta_keys": [
161
+ "task_description",
162
+ "prompt",
163
+ "info",
164
+ "stats"
165
+ ],
166
+ "type": "DictCollator"
167
+ },
168
+ "enable_gradient_checkpointing": false,
169
+ "enable_mixed_precision_training": true,
170
+ "learning_rate": 1.5e-05,
171
+ "lr_scheduler_type": "linear-warmup+cosine-decay",
172
+ "max_epochs": 18,
173
+ "max_grad_norm": 1.0,
174
+ "metric": {
175
+ "active_trackers": [
176
+ "jsonl",
177
+ "tensorboard"
178
+ ],
179
+ "grad_accumulation_steps": 1,
180
+ "run_dir": "work_dirs",
181
+ "type": "VLAMetric",
182
+ "window_size": 1
183
+ },
184
+ "mixed_precision_dtype": "bf16",
185
+ "sampler": null,
186
+ "tokenizer": {
187
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
188
+ "type": "PretrainedTokenizer"
189
+ },
190
+ "type": "FSDPTrainRunner",
191
+ "warmup_ratio": 0.03,
192
+ "weight_decay": 0.0
193
+ },
194
+ "train_dataloader": {
195
+ "dataset": {
196
+ "datasets": {
197
+ "action_key": "action",
198
+ "action_window_size": 10,
199
+ "data_root_path": "datasets/libero_spatial_no_noops_lerobotv2.1",
200
+ "statistic_name": "libero_spatial_no_noops",
201
+ "transforms": [
202
+ {
203
+ "embodiment_id": 2,
204
+ "name_mappings": {
205
+ "actions": [
206
+ "actions"
207
+ ],
208
+ "observation.state": [
209
+ "states"
210
+ ]
211
+ },
212
+ "parquet_keys": [
213
+ "observation.state",
214
+ "timestamp",
215
+ "actions",
216
+ "info",
217
+ "stats",
218
+ "action_masks"
219
+ ],
220
+ "type": "ProcessParquetInputs",
221
+ "video_keys": [
222
+ "observation.images.image",
223
+ "observation.images.wrist_image"
224
+ ]
225
+ },
226
+ {
227
+ "type": "ParquetPrompter"
228
+ },
229
+ {
230
+ "max_len": 600,
231
+ "num_images": 2,
232
+ "tokenizer": {
233
+ "model_path": "fluxvla/models/third_party_models/eagle2_hg_model",
234
+ "type": "PretrainedTokenizer"
235
+ },
236
+ "type": "ProcessPromptsWithImage"
237
+ },
238
+ {
239
+ "height": 224,
240
+ "type": "ResizeImages",
241
+ "width": 224
242
+ },
243
+ {
244
+ "means": [
245
+ [
246
+ 123.515625,
247
+ 116.04492188,
248
+ 103.59375
249
+ ],
250
+ [
251
+ 123.515625,
252
+ 116.04492188,
253
+ 103.59375
254
+ ]
255
+ ],
256
+ "stds": [
257
+ [
258
+ 58.27148438,
259
+ 57.02636719,
260
+ 57.27539062
261
+ ],
262
+ [
263
+ 58.27148438,
264
+ 57.02636719,
265
+ 57.27539062
266
+ ]
267
+ ],
268
+ "type": "NormalizeImages"
269
+ },
270
+ {
271
+ "action_dim": 32,
272
+ "action_key": "action",
273
+ "norm_type": "mean_std",
274
+ "state_dim": 64,
275
+ "state_key": "proprio",
276
+ "type": "NormalizeStatesAndActions"
277
+ }
278
+ ],
279
+ "type": "ParquetDataset",
280
+ "use_delta": false,
281
+ "window_start_idx": 0
282
+ },
283
+ "name_mappings": {
284
+ "action": [
285
+ "action"
286
+ ],
287
+ "observation.state": [
288
+ "proprio"
289
+ ]
290
+ },
291
+ "statistic_keys": [
292
+ "observation.state",
293
+ "timestamp",
294
+ "action"
295
+ ],
296
+ "statistic_name": "libero_spatial_no_noops",
297
+ "type": "DistributedRepeatingDataset"
298
+ },
299
+ "per_device_batch_size": 8,
300
+ "per_device_num_workers": 4
301
+ }
302
+ }
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/config.yaml ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ eval:
2
+ dataset:
3
+ transforms:
4
+ - embodiment_id: 2
5
+ img_keys:
6
+ - agentview_image
7
+ - robot0_eye_in_hand_image
8
+ type: ProcessLiberoEvalInputs
9
+ - image_resize_strategy: resize-naive
10
+ input_sizes:
11
+ - - 3
12
+ - 224
13
+ - 224
14
+ - - 3
15
+ - 224
16
+ - 224
17
+ means:
18
+ - - 123.515625
19
+ - 116.04492188
20
+ - 103.59375
21
+ - - 123.515625
22
+ - 116.04492188
23
+ - 103.59375
24
+ stds:
25
+ - - 58.27148438
26
+ - 57.02636719
27
+ - 57.27539062
28
+ - - 58.27148438
29
+ - 57.02636719
30
+ - 57.27539062
31
+ type: TransformImage
32
+ - max_len: 600
33
+ num_images: 2
34
+ tokenizer:
35
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
36
+ type: PretrainedTokenizer
37
+ type: ProcessPromptsWithImage
38
+ - gripper_key: robot0_gripper_qpos
39
+ norm_type: mean_std
40
+ out_key: states
41
+ pos_key: robot0_eef_pos
42
+ quat_key: robot0_eef_quat
43
+ state_dim: 64
44
+ type: LiberoProprioFromInputs
45
+ type: LiberoParquetEvalDataset
46
+ denormalize_action:
47
+ norm_type: mean_std
48
+ type: DenormalizeLiberoAction
49
+ eval_chunk_size: 10
50
+ model_family: pi0
51
+ num_steps_wait: 10
52
+ num_trials_per_task: 50
53
+ resize_size: 224
54
+ seed: 7
55
+ task_suite_name: libero_spatial
56
+ type: LiberoEvalRunner
57
+ inference_model:
58
+ pretrained_name_or_path: ./checkpoints/GR00T-N1.5-3B
59
+ type: LlavaVLA
60
+ vla_head:
61
+ action_dim: 32
62
+ diffusion_model_cfg:
63
+ attention_head_dim: 48
64
+ cross_attention_dim: 2048
65
+ dropout: 0.2
66
+ final_dropout: true
67
+ interleave_self_attention: true
68
+ norm_type: ada_norm
69
+ num_attention_heads: 32
70
+ num_layers: 16
71
+ output_dim: 1024
72
+ positional_embeddings: null
73
+ hidden_size: 1024
74
+ input_embedding_dim: 1536
75
+ num_heads: 4
76
+ num_inference_timesteps: 4
77
+ num_layers: 1
78
+ ori_action_dim: 7
79
+ state_dim: 64
80
+ traj_length: 10
81
+ type: FlowMatchingInferenceHead
82
+ vlm_backbone:
83
+ type: EagleInferenceBackbone
84
+ vlm_path: fluxvla/models/third_party_models/eagle2_hg_model
85
+ model:
86
+ freeze_projector: false
87
+ freeze_vlm_backbone: false
88
+ name_mapping:
89
+ vla_head: action_head
90
+ vlm_backbone.vlm: backbone.eagle_model
91
+ pretrained_name_or_path: ./checkpoints/GR00T-N1.5-3B
92
+ type: LlavaVLA
93
+ vla_head:
94
+ action_dim: 32
95
+ hidden_size: 1024
96
+ input_embedding_dim: 1536
97
+ num_heads: 4
98
+ num_inference_timesteps: 4
99
+ num_layers: 1
100
+ ori_action_dim: 7
101
+ state_dim: 64
102
+ traj_length: 10
103
+ type: FlowMatchingHead
104
+ vlm_backbone:
105
+ type: EagleBackbone
106
+ vlm_path: fluxvla/models/third_party_models/eagle2_hg_model
107
+ runner:
108
+ change_key_name: false
109
+ collator:
110
+ keys:
111
+ - states
112
+ - observation.eepose
113
+ - timestamp
114
+ - images
115
+ - img_masks
116
+ - lang_tokens
117
+ - lang_masks
118
+ - actions
119
+ - action_masks
120
+ - embodiment_ids
121
+ meta_keys:
122
+ - task_description
123
+ - prompt
124
+ - info
125
+ - stats
126
+ type: DictCollator
127
+ enable_gradient_checkpointing: false
128
+ enable_mixed_precision_training: true
129
+ learning_rate: 1.5e-05
130
+ lr_scheduler_type: linear-warmup+cosine-decay
131
+ max_epochs: 18
132
+ max_grad_norm: 1.0
133
+ metric:
134
+ active_trackers:
135
+ - jsonl
136
+ - tensorboard
137
+ grad_accumulation_steps: 1
138
+ run_dir: work_dirs
139
+ type: VLAMetric
140
+ window_size: 1
141
+ mixed_precision_dtype: bf16
142
+ sampler: null
143
+ tokenizer:
144
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
145
+ type: PretrainedTokenizer
146
+ type: FSDPTrainRunner
147
+ warmup_ratio: 0.03
148
+ weight_decay: 0.0
149
+ train_dataloader:
150
+ dataset:
151
+ datasets:
152
+ action_key: action
153
+ action_window_size: 10
154
+ data_root_path: datasets/libero_spatial_no_noops_lerobotv2.1
155
+ statistic_name: libero_spatial_no_noops
156
+ transforms:
157
+ - embodiment_id: 2
158
+ name_mappings:
159
+ actions:
160
+ - actions
161
+ observation.state:
162
+ - states
163
+ parquet_keys:
164
+ - observation.state
165
+ - timestamp
166
+ - actions
167
+ - info
168
+ - stats
169
+ - action_masks
170
+ type: ProcessParquetInputs
171
+ video_keys:
172
+ - observation.images.image
173
+ - observation.images.wrist_image
174
+ - type: ParquetPrompter
175
+ - max_len: 600
176
+ num_images: 2
177
+ tokenizer:
178
+ model_path: fluxvla/models/third_party_models/eagle2_hg_model
179
+ type: PretrainedTokenizer
180
+ type: ProcessPromptsWithImage
181
+ - height: 224
182
+ type: ResizeImages
183
+ width: 224
184
+ - means:
185
+ - - 123.515625
186
+ - 116.04492188
187
+ - 103.59375
188
+ - - 123.515625
189
+ - 116.04492188
190
+ - 103.59375
191
+ stds:
192
+ - - 58.27148438
193
+ - 57.02636719
194
+ - 57.27539062
195
+ - - 58.27148438
196
+ - 57.02636719
197
+ - 57.27539062
198
+ type: NormalizeImages
199
+ - action_dim: 32
200
+ action_key: action
201
+ norm_type: mean_std
202
+ state_dim: 64
203
+ state_key: proprio
204
+ type: NormalizeStatesAndActions
205
+ type: ParquetDataset
206
+ use_delta: false
207
+ window_start_idx: 0
208
+ name_mappings:
209
+ action:
210
+ - action
211
+ observation.state:
212
+ - proprio
213
+ statistic_keys:
214
+ - observation.state
215
+ - timestamp
216
+ - action
217
+ statistic_name: libero_spatial_no_noops
218
+ type: DistributedRepeatingDataset
219
+ per_device_batch_size: 8
220
+ per_device_num_workers: 4
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/dataset_statistics.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "libero_spatial_no_noops": {
3
+ "proprio": {
4
+ "mean": [
5
+ -0.024462566693947342,
6
+ 0.10653030478388664,
7
+ 1.0580495716300307,
8
+ 3.062855007870368,
9
+ -0.10464045916348884,
10
+ 0.08307320236969534,
11
+ 0.019954609054627596,
12
+ -0.02016269208612657
13
+ ],
14
+ "std": [
15
+ 0.03894316411915835,
16
+ 0.04873628070932603,
17
+ 0.03692094784082842,
18
+ 0.0369502396792774,
19
+ 0.14538513627309543,
20
+ 0.07695788947742314,
21
+ 0.006102641532497049,
22
+ 0.006049884044502419
23
+ ],
24
+ "min": [
25
+ -0.3095473051071167,
26
+ -0.29250794649124146,
27
+ 0.9095591306686401,
28
+ 2.497488260269165,
29
+ -1.8006486892700195,
30
+ -0.7207611203193665,
31
+ -0.0004703797458205372,
32
+ -0.041536275297403336
33
+ ],
34
+ "max": [
35
+ 0.1759040206670761,
36
+ 0.3904820382595062,
37
+ 1.3290715217590332,
38
+ 3.4566118717193604,
39
+ 1.2268599271774292,
40
+ 1.0429412126541138,
41
+ 0.041053611785173416,
42
+ 0.000775813648942858
43
+ ],
44
+ "q01": null,
45
+ "q99": null
46
+ },
47
+ "timestamp": {
48
+ "mean": [
49
+ 3.1281914291108173
50
+ ],
51
+ "std": [
52
+ 1.9190018719668336
53
+ ],
54
+ "min": [
55
+ 0.0
56
+ ],
57
+ "max": [
58
+ 9.6
59
+ ],
60
+ "q01": null,
61
+ "q99": null
62
+ },
63
+ "action": {
64
+ "mean": [
65
+ 0.15312488430795423,
66
+ 0.13707241597825376,
67
+ -0.15526779033841448,
68
+ -0.005176474488725037,
69
+ -0.011208756940533639,
70
+ -0.02019425420384803,
71
+ 0.4578818200364616
72
+ ],
73
+ "std": [
74
+ 0.15599777645651164,
75
+ 0.13125442554438385,
76
+ 0.19226097301543327,
77
+ 0.014084604392168992,
78
+ 0.02738322007326005,
79
+ 0.021779582921450876,
80
+ 0.18831055119433956
81
+ ],
82
+ "min": [
83
+ -0.9375,
84
+ -0.9375,
85
+ -0.9375,
86
+ -0.1875,
87
+ -0.3675000071525574,
88
+ -0.36000001430511475,
89
+ 0.0
90
+ ],
91
+ "max": [
92
+ 0.9375,
93
+ 0.9375,
94
+ 0.9375,
95
+ 0.1971428543329239,
96
+ 0.33642858266830444,
97
+ 0.375,
98
+ 1.0
99
+ ],
100
+ "q01": null,
101
+ "q99": null
102
+ }
103
+ }
104
+ }
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/gr00t_eagle_3b_libero_spatial_full_finetune_2026_05_14_02_41_01.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/run-metrics.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"hparams": "{'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_spatial_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_spatial_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': 
{'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_spatial_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_spatial', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_spatial_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 
'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_spatial_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_spatial_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], 
[123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_spatial_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_spatial', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_spatial_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_spatial_full_finetune_2026_05_14_02_41_01'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_spatial_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_spatial_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_spatial', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_spatial', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 
224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'run_id': 'gr00t_eagle_3b_libero_spatial_full_finetune_2026_05_14_02_41_01'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_spatial_full_finetune.py): {'model': {'type': 'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7}, 'freeze_vlm_backbone': False, 'name_mapping': {'vlm_backbone.vlm': 'backbone.eagle_model', 'vla_head': 'action_head'}, 'freeze_projector': False}, 'inference_model': {'type': 
'LlavaVLA', 'pretrained_name_or_path': './checkpoints/GR00T-N1.5-3B', 'vlm_backbone': {'type': 'EagleInferenceBackbone', 'vlm_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'vla_head': {'type': 'FlowMatchingInferenceHead', 'state_dim': 64, 'hidden_size': 1024, 'input_embedding_dim': 1536, 'num_layers': 1, 'num_heads': 4, 'num_inference_timesteps': 4, 'traj_length': 10, 'action_dim': 32, 'ori_action_dim': 7, 'diffusion_model_cfg': {'attention_head_dim': 48, 'cross_attention_dim': 2048, 'dropout': 0.2, 'final_dropout': True, 'interleave_self_attention': True, 'norm_type': 'ada_norm', 'num_attention_heads': 32, 'num_layers': 16, 'output_dim': 1024, 'positional_embeddings': None}}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_spatial_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': 'datasets/libero_spatial_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'embodiment_id': 2, 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter'}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 64, 
'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_spatial_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 1.5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sampler': None, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks', 'embodiment_ids'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/gr00t/libero_spatial', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/gr00t/gr00t_eagle_3b_libero_spatial_full_finetune.py): {...}, 'run_id': 'gr00t_eagle_3b_libero_spatial_full_finetune_2026_05_14_02_41_01'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': False, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/gr00t/gr00t_eagle_3b_libero_spatial_full_finetune.py): {...}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_spatial_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_spatial', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_spatial', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': 
['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}, 'args': Namespace(config='configs/gr00t/gr00t_eagle_3b_libero_spatial_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/gr00t/libero_spatial', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_spatial', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'embodiment_id': 2, 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'ProcessPromptsWithImage', 'max_len': 600, 'num_images': 2, 'tokenizer': {'type': 'PretrainedTokenizer', 'model_path': 'fluxvla/models/third_party_models/eagle2_hg_model'}}, {'type': 'LiberoProprioFromInputs', 'state_dim': 64, 
'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std'}}}", "run_id": "gr00t_eagle_3b_libero_spatial_full_finetune_2026_05_14_02_41_01"}
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/added_tokens.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</box>": 151673,
3
+ "</img>": 151671,
4
+ "</interval>": 151679,
5
+ "</quad>": 151675,
6
+ "</ref>": 151677,
7
+ "</think>": 151668,
8
+ "</tool_call>": 151658,
9
+ "</tool_response>": 151666,
10
+ "<IMG_CONTEXT>": 151669,
11
+ "<box>": 151672,
12
+ "<img>": 151670,
13
+ "<interval>": 151678,
14
+ "<quad>": 151674,
15
+ "<ref>": 151676,
16
+ "<think>": 151667,
17
+ "<tool_call>": 151657,
18
+ "<tool_response>": 151665,
19
+ "<|box_end|>": 151649,
20
+ "<|box_start|>": 151648,
21
+ "<|endoftext|>": 151643,
22
+ "<|file_sep|>": 151664,
23
+ "<|fim_middle|>": 151660,
24
+ "<|fim_pad|>": 151662,
25
+ "<|fim_prefix|>": 151659,
26
+ "<|fim_suffix|>": 151661,
27
+ "<|im_end|>": 151645,
28
+ "<|im_start|>": 151644,
29
+ "<|image_pad|>": 151655,
30
+ "<|object_ref_end|>": 151647,
31
+ "<|object_ref_start|>": 151646,
32
+ "<|quad_end|>": 151651,
33
+ "<|quad_start|>": 151650,
34
+ "<|repo_name|>": 151663,
35
+ "<|video_pad|>": 151656,
36
+ "<|vision_end|>": 151653,
37
+ "<|vision_pad|>": 151654,
38
+ "<|vision_start|>": 151652
39
+ }
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<IMG_CONTEXT>",
17
+ "<img>",
18
+ "</img>",
19
+ "<box>",
20
+ "</box>",
21
+ "<quad>",
22
+ "</quad>",
23
+ "<ref>",
24
+ "</ref>",
25
+ "<interval>",
26
+ "</interval>"
27
+ ],
28
+ "eos_token": {
29
+ "content": "<|im_end|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<IMG_CONTEXT>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "<img>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "</img>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<box>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</box>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<quad>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</quad>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "<ref>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "</ref>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<interval>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ },
294
+ "151679": {
295
+ "content": "</interval>",
296
+ "lstrip": false,
297
+ "normalized": false,
298
+ "rstrip": false,
299
+ "single_word": false,
300
+ "special": true
301
+ }
302
+ },
303
+ "additional_special_tokens": [
304
+ "<|im_start|>",
305
+ "<|im_end|>",
306
+ "<|object_ref_start|>",
307
+ "<|object_ref_end|>",
308
+ "<|box_start|>",
309
+ "<|box_end|>",
310
+ "<|quad_start|>",
311
+ "<|quad_end|>",
312
+ "<|vision_start|>",
313
+ "<|vision_end|>",
314
+ "<|vision_pad|>",
315
+ "<|image_pad|>",
316
+ "<|video_pad|>",
317
+ "<IMG_CONTEXT>",
318
+ "<img>",
319
+ "</img>",
320
+ "<box>",
321
+ "</box>",
322
+ "<quad>",
323
+ "</quad>",
324
+ "<ref>",
325
+ "</ref>",
326
+ "<interval>",
327
+ "</interval>"
328
+ ],
329
+ "auto_map": {
330
+ "AutoProcessor": "processing_eagle2_5_vl.Eagle2_5_VLProcessor"
331
+ },
332
+ "bos_token": null,
333
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
334
+ "clean_up_tokenization_spaces": false,
335
+ "eos_token": "<|im_end|>",
336
+ "errors": "replace",
337
+ "extra_special_tokens": {},
338
+ "model_max_length": 16384,
339
+ "pad_token": "<|endoftext|>",
340
+ "processor_class": "Eagle2_5_VLProcessor",
341
+ "split_special_tokens": false,
342
+ "tokenizer_class": "Qwen2Tokenizer",
343
+ "unk_token": null
344
+ }
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
gr00t_eagle_3b_libero_spatial_full_finetune_bs64/vlm_backbone_config.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation": "flash_attention_2",
3
+ "architectures": [
4
+ "Eagle2_5_VLForConditionalGeneration"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_eagle2_5_vl.Eagle2_5_VLConfig",
8
+ "AutoModel": "modeling_eagle2_5_vl.Eagle2_5_VLForConditionalGeneration"
9
+ },
10
+ "downsample_ratio": 0.5,
11
+ "dynamic_image_size": true,
12
+ "force_image_size": 224,
13
+ "image_token_index": 151669,
14
+ "initializer_range": 0.02,
15
+ "loss_version": "efficient_v2_cp_head",
16
+ "max_dynamic_tiles": 12,
17
+ "min_dynamic_tiles": 1,
18
+ "mlp_checkpoint": false,
19
+ "mlp_connector_layers": 1,
20
+ "model_type": "eagle_2_5_vl",
21
+ "output_attentions": false,
22
+ "pad2square": false,
23
+ "select_layer": -1,
24
+ "template": "qwen3-chat",
25
+ "text_config": {
26
+ "_name_or_path": "Qwen/Qwen3-1.7B",
27
+ "architectures": [
28
+ "Qwen3ForCausalLM"
29
+ ],
30
+ "attention_bias": false,
31
+ "attention_dropout": 0,
32
+ "bos_token_id": 151643,
33
+ "eos_token_id": 151645,
34
+ "head_dim": 128,
35
+ "hidden_act": "silu",
36
+ "hidden_size": 2048,
37
+ "initializer_range": 0.02,
38
+ "intermediate_size": 6144,
39
+ "layer_types": [
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention"
68
+ ],
69
+ "max_position_embeddings": 40960,
70
+ "max_window_layers": 28,
71
+ "model_type": "qwen3",
72
+ "num_attention_heads": 16,
73
+ "num_hidden_layers": 12,
74
+ "num_key_value_heads": 8,
75
+ "rms_norm_eps": 1e-06,
76
+ "rope_scaling": null,
77
+ "rope_theta": 1000000,
78
+ "sliding_window": null,
79
+ "tie_word_embeddings": true,
80
+ "torch_dtype": "bfloat16",
81
+ "use_cache": false,
82
+ "use_sliding_window": false,
83
+ "vocab_size": 151680
84
+ },
85
+ "tie_word_embeddings": true,
86
+ "torch_dtype": "bfloat16",
87
+ "transformers_version": null,
88
+ "use_backbone_lora": 0,
89
+ "use_llm_lora": 0,
90
+ "use_pixel_shuffle": false,
91
+ "use_thumbnail": true,
92
+ "vision_config": {
93
+ "attention_dropout": 0,
94
+ "hidden_act": "gelu_pytorch_tanh",
95
+ "hidden_size": 1152,
96
+ "image_size": 224,
97
+ "intermediate_size": 4304,
98
+ "layer_norm_eps": 1e-06,
99
+ "model_type": "siglip_vision_model",
100
+ "num_attention_heads": 16,
101
+ "num_channels": 3,
102
+ "num_hidden_layers": 27,
103
+ "patch_size": 14,
104
+ "torch_dtype": "bfloat16"
105
+ }
106
+ }
pi05_paligemma_libero_10_full_finetune_bs64/checkpoints/step-038064-epoch-24-loss=0.0170.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a79542cbb79b75deea3804d91f87f9c2180ff704f0cd1988469ee72655fbe15
3
+ size 7233625688
pi05_paligemma_libero_10_full_finetune_bs64/config.json CHANGED
@@ -225,14 +225,12 @@
225
  "enable_mixed_precision_training": true,
226
  "learning_rate": 5e-05,
227
  "lr_scheduler_type": "linear-warmup+cosine-decay",
228
- "max_epochs": 18,
229
  "max_grad_norm": 1.0,
230
- "max_keep_ckpts": 1,
231
- "max_steps": null,
232
  "metric": {
233
  "active_trackers": [
234
  "jsonl",
235
- "wandb"
236
  ],
237
  "grad_accumulation_steps": 1,
238
  "run_dir": "work_dirs",
@@ -241,6 +239,7 @@
241
  },
242
  "mixed_precision_dtype": "bf16",
243
  "sampler": null,
 
244
  "tokenizer": {
245
  "type": "PaligemmaTokenizer"
246
  },
 
225
  "enable_mixed_precision_training": true,
226
  "learning_rate": 5e-05,
227
  "lr_scheduler_type": "linear-warmup+cosine-decay",
228
+ "max_epochs": 24,
229
  "max_grad_norm": 1.0,
 
 
230
  "metric": {
231
  "active_trackers": [
232
  "jsonl",
233
+ "tensorboard"
234
  ],
235
  "grad_accumulation_steps": 1,
236
  "run_dir": "work_dirs",
 
239
  },
240
  "mixed_precision_dtype": "bf16",
241
  "sampler": null,
242
+ "sharding_strategy": "no-shard",
243
  "tokenizer": {
244
  "type": "PaligemmaTokenizer"
245
  },
pi05_paligemma_libero_10_full_finetune_bs64/config.yaml CHANGED
@@ -181,20 +181,19 @@ runner:
181
  enable_mixed_precision_training: true
182
  learning_rate: 5.0e-05
183
  lr_scheduler_type: linear-warmup+cosine-decay
184
- max_epochs: 18
185
  max_grad_norm: 1.0
186
- max_keep_ckpts: 1
187
- max_steps: null
188
  metric:
189
  active_trackers:
190
  - jsonl
191
- - wandb
192
  grad_accumulation_steps: 1
193
  run_dir: work_dirs
194
  type: VLAMetric
195
  window_size: 1
196
  mixed_precision_dtype: bf16
197
  sampler: null
 
198
  tokenizer:
199
  type: PaligemmaTokenizer
200
  type: FSDPTrainRunner
 
181
  enable_mixed_precision_training: true
182
  learning_rate: 5.0e-05
183
  lr_scheduler_type: linear-warmup+cosine-decay
184
+ max_epochs: 24
185
  max_grad_norm: 1.0
 
 
186
  metric:
187
  active_trackers:
188
  - jsonl
189
+ - tensorboard
190
  grad_accumulation_steps: 1
191
  run_dir: work_dirs
192
  type: VLAMetric
193
  window_size: 1
194
  mixed_precision_dtype: bf16
195
  sampler: null
196
+ sharding_strategy: no-shard
197
  tokenizer:
198
  type: PaligemmaTokenizer
199
  type: FSDPTrainRunner
pi05_paligemma_libero_10_full_finetune_bs64/pi05_paligemma_libero_10_full_finetune_2026_05_15_09_15_10.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6b55f3eac24c09f0b32008eb10d130f9769ce38528f7b110d62789bbd20826e
3
+ size 10747290
pi05_paligemma_libero_10_full_finetune_bs64/run-metrics.jsonl CHANGED
@@ -1 +1 @@
1
- {"hparams": "{'model': {'type': 'PI05FlowMatching', 'llm_backbone': {'type': 'ConditionGemmaModel', 'adarms_cond_dim': None, 'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 16384, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'use_cache': True, 'vocab_size': 257152}, 'vision_backbone': {'type': 'SigLIPViTBackbone', 'vision_backbone_id': 'siglip_224', 'vision_config': {'attention_dropout': 0.0, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1152, 'image_size': 224, 'intermediate_size': 4304, 'layer_norm_eps': 1e-06, 'model_type': 'siglip_vision_model', 'num_attention_heads': 16, 'num_channels': 3, 'num_hidden_layers': 27, 'patch_size': 14, 'projection_dim': 2048, 'projector_hidden_act': 'gelu_fast', 'torch_dtype': 'float32', 'vision_use_head': False}}, 'projector': {'type': 'LinearProjector', 'in_dim': 1152, 'out_dim': 2048}, 'proj_width': 1024, 'n_action_steps': 10, 'action_in_proj': {'type': 'LinearProjector', 'in_dim': 32, 'out_dim': 1024}, 'action_out_proj': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 32}, 'time_mlp_in': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'time_mlp_out': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'max_action_dim': 32, 'llm_expert': {'type': 'ConditionGemmaModel', 'attention_bias': False, 'adarms_cond_dim': 1024, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 4096, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 
'num_hidden_layers': 18, 'num_key_value_heads': 1, 'pad_token_id': 0, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'transformers_version': '4.48.1', 'use_adarms': True, 'use_cache': True, 'vocab_size': 257152}, 'freeze_llm_backbone': False, 'freeze_vision_backbone': False, 'pretrained_name_or_path': './checkpoints/pi05_libero/model.safetensors', 'name_mapping': {'llm_backbone': 'paligemma_with_expert.paligemma.model.language_model', 'vision_backbone.vision': 'paligemma_with_expert.paligemma.model.vision_tower', 'projector.projector': 'paligemma_with_expert.paligemma.model.multi_modal_projector.linear', 'llm_expert': 'paligemma_with_expert.gemma_expert.model', 'time_mlp_in.projector': 'time_mlp_in', 'time_mlp_out.projector': 'time_mlp_out', 'action_in_proj.projector': 'action_in_proj', 'action_out_proj.projector': 'action_out_proj', 'llm_backbone.embed_tokens': 'paligemma_with_expert.paligemma.lm_head'}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': './datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter', 'use_conversation': False}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 
57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 32, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'sampler': None, 'tokenizer': {'type': 'PaligemmaTokenizer'}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': './work_dirs/pi05_paligemma_libero_10_full_finetune_bs64', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {'model': {'type': 'PI05FlowMatching', 'llm_backbone': {'type': 'ConditionGemmaModel', 'adarms_cond_dim': None, 'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 16384, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'use_cache': True, 'vocab_size': 257152}, 'vision_backbone': {'type': 'SigLIPViTBackbone', 'vision_backbone_id': 'siglip_224', 'vision_config': {'attention_dropout': 0.0, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1152, 'image_size': 224, 'intermediate_size': 4304, 'layer_norm_eps': 1e-06, 'model_type': 'siglip_vision_model', 'num_attention_heads': 16, 'num_channels': 3, 
'num_hidden_layers': 27, 'patch_size': 14, 'projection_dim': 2048, 'projector_hidden_act': 'gelu_fast', 'torch_dtype': 'float32', 'vision_use_head': False}}, 'projector': {'type': 'LinearProjector', 'in_dim': 1152, 'out_dim': 2048}, 'proj_width': 1024, 'n_action_steps': 10, 'action_in_proj': {'type': 'LinearProjector', 'in_dim': 32, 'out_dim': 1024}, 'action_out_proj': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 32}, 'time_mlp_in': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'time_mlp_out': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'max_action_dim': 32, 'llm_expert': {'type': 'ConditionGemmaModel', 'attention_bias': False, 'adarms_cond_dim': 1024, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 4096, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'pad_token_id': 0, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'transformers_version': '4.48.1', 'use_adarms': True, 'use_cache': True, 'vocab_size': 257152}, 'freeze_llm_backbone': False, 'freeze_vision_backbone': False, 'pretrained_name_or_path': './checkpoints/pi05_libero/model.safetensors', 'name_mapping': {'llm_backbone': 'paligemma_with_expert.paligemma.model.language_model', 'vision_backbone.vision': 'paligemma_with_expert.paligemma.model.vision_tower', 'projector.projector': 'paligemma_with_expert.paligemma.model.multi_modal_projector.linear', 'llm_expert': 'paligemma_with_expert.gemma_expert.model', 'time_mlp_in.projector': 'time_mlp_in', 'time_mlp_out.projector': 'time_mlp_out', 'action_in_proj.projector': 'action_in_proj', 'action_out_proj.projector': 'action_out_proj', 'llm_backbone.embed_tokens': 'paligemma_with_expert.paligemma.lm_head'}}, 'train_dataloader': 
{'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': './datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter', 'use_conversation': False}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 32, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'sampler': None, 'tokenizer': {'type': 'PaligemmaTokenizer'}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': './work_dirs/pi05_paligemma_libero_10_full_finetune_bs64', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: 
configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {...}, 'run_id': 'pi05_paligemma_libero_10_full_finetune_2026_03_12_06_33_02'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'max_steps': None, 'max_keep_ckpts': 1, 'cfg': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {...}, 'args': Namespace(config='configs/pi05/pi05_paligemma_libero_10_full_finetune.py', work_dir='./work_dirs/pi05_paligemma_libero_10_full_finetune_bs64', cfg_options={'train_dataloader.per_device_batch_size': 8, 'runner.max_epochs': 18, 'runner.max_steps': None, 'runner.max_keep_ckpts': 1}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'LiberoPromptFromInputs', 'use_conversation': False, 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'LiberoProprioFromInputs', 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'state_dim': 32, 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std', 'action_dim': 7}}}, 'run_id': 'pi05_paligemma_libero_10_full_finetune_2026_03_12_06_33_02'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 
'warmup_ratio': 0.03, 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'max_steps': None, 'max_keep_ckpts': 1, 'cfg': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {'model': {'type': 'PI05FlowMatching', 'llm_backbone': {'type': 'ConditionGemmaModel', 'adarms_cond_dim': None, 'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 16384, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'use_cache': True, 'vocab_size': 257152}, 'vision_backbone': {'type': 'SigLIPViTBackbone', 'vision_backbone_id': 'siglip_224', 'vision_config': {'attention_dropout': 0.0, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1152, 'image_size': 224, 'intermediate_size': 4304, 'layer_norm_eps': 1e-06, 'model_type': 'siglip_vision_model', 'num_attention_heads': 16, 'num_channels': 3, 'num_hidden_layers': 27, 'patch_size': 14, 'projection_dim': 2048, 'projector_hidden_act': 'gelu_fast', 'torch_dtype': 'float32', 'vision_use_head': False}}, 'projector': {'type': 'LinearProjector', 'in_dim': 1152, 'out_dim': 2048}, 'proj_width': 1024, 'n_action_steps': 10, 'action_in_proj': {'type': 'LinearProjector', 'in_dim': 32, 'out_dim': 1024}, 'action_out_proj': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 32}, 'time_mlp_in': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'time_mlp_out': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'max_action_dim': 32, 'llm_expert': {'type': 'ConditionGemmaModel', 'attention_bias': False, 'adarms_cond_dim': 1024, 'attention_dropout': 0.0, 'bos_token_id': 2, 
'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 4096, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'pad_token_id': 0, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'transformers_version': '4.48.1', 'use_adarms': True, 'use_cache': True, 'vocab_size': 257152}, 'freeze_llm_backbone': False, 'freeze_vision_backbone': False, 'pretrained_name_or_path': './checkpoints/pi05_libero/model.safetensors', 'name_mapping': {'llm_backbone': 'paligemma_with_expert.paligemma.model.language_model', 'vision_backbone.vision': 'paligemma_with_expert.paligemma.model.vision_tower', 'projector.projector': 'paligemma_with_expert.paligemma.model.multi_modal_projector.linear', 'llm_expert': 'paligemma_with_expert.gemma_expert.model', 'time_mlp_in.projector': 'time_mlp_in', 'time_mlp_out.projector': 'time_mlp_out', 'action_in_proj.projector': 'action_in_proj', 'action_out_proj.projector': 'action_out_proj', 'llm_backbone.embed_tokens': 'paligemma_with_expert.paligemma.lm_head'}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': './datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter', 'use_conversation': False}, {'type': 
'ProcessPrompts', 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 32, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 18, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'sampler': None, 'tokenizer': {'type': 'PaligemmaTokenizer'}, 'metric': {'type': 'VLAMetric', 'active_trackers': ('jsonl', 'wandb'), 'run_dir': './work_dirs/pi05_paligemma_libero_10_full_finetune_bs64', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {...}, 'run_id': 'pi05_paligemma_libero_10_full_finetune_2026_03_12_06_33_02'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'max_steps': None, 'max_keep_ckpts': 1, 'cfg': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {...}, 'args': Namespace(config='configs/pi05/pi05_paligemma_libero_10_full_finetune.py', work_dir='./work_dirs/pi05_paligemma_libero_10_full_finetune_bs64', cfg_options={'train_dataloader.per_device_batch_size': 8, 'runner.max_epochs': 18, 'runner.max_steps': None, 
'runner.max_keep_ckpts': 1}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'LiberoPromptFromInputs', 'use_conversation': False, 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'LiberoProprioFromInputs', 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'state_dim': 32, 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std', 'action_dim': 7}}}, 'args': Namespace(config='configs/pi05/pi05_paligemma_libero_10_full_finetune.py', work_dir='./work_dirs/pi05_paligemma_libero_10_full_finetune_bs64', cfg_options={'train_dataloader.per_device_batch_size': 8, 'runner.max_epochs': 18, 'runner.max_steps': None, 'runner.max_keep_ckpts': 1}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], 
[123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'LiberoPromptFromInputs', 'use_conversation': False, 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'LiberoProprioFromInputs', 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'state_dim': 32, 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std', 'action_dim': 7}}}", "run_id": "pi05_paligemma_libero_10_full_finetune_2026_03_12_06_33_02"}
 
1
+ {"hparams": "{'model': {'type': 'PI05FlowMatching', 'llm_backbone': {'type': 'ConditionGemmaModel', 'adarms_cond_dim': None, 'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 16384, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'use_cache': True, 'vocab_size': 257152}, 'vision_backbone': {'type': 'SigLIPViTBackbone', 'vision_backbone_id': 'siglip_224', 'vision_config': {'attention_dropout': 0.0, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1152, 'image_size': 224, 'intermediate_size': 4304, 'layer_norm_eps': 1e-06, 'model_type': 'siglip_vision_model', 'num_attention_heads': 16, 'num_channels': 3, 'num_hidden_layers': 27, 'patch_size': 14, 'projection_dim': 2048, 'projector_hidden_act': 'gelu_fast', 'torch_dtype': 'float32', 'vision_use_head': False}}, 'projector': {'type': 'LinearProjector', 'in_dim': 1152, 'out_dim': 2048}, 'proj_width': 1024, 'n_action_steps': 10, 'action_in_proj': {'type': 'LinearProjector', 'in_dim': 32, 'out_dim': 1024}, 'action_out_proj': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 32}, 'time_mlp_in': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'time_mlp_out': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'max_action_dim': 32, 'llm_expert': {'type': 'ConditionGemmaModel', 'attention_bias': False, 'adarms_cond_dim': 1024, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 4096, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 
'num_hidden_layers': 18, 'num_key_value_heads': 1, 'pad_token_id': 0, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'transformers_version': '4.48.1', 'use_adarms': True, 'use_cache': True, 'vocab_size': 257152}, 'freeze_llm_backbone': False, 'freeze_vision_backbone': False, 'pretrained_name_or_path': './checkpoints/pi05_libero/model.safetensors', 'name_mapping': {'llm_backbone': 'paligemma_with_expert.paligemma.model.language_model', 'vision_backbone.vision': 'paligemma_with_expert.paligemma.model.vision_tower', 'projector.projector': 'paligemma_with_expert.paligemma.model.multi_modal_projector.linear', 'llm_expert': 'paligemma_with_expert.gemma_expert.model', 'time_mlp_in.projector': 'time_mlp_in', 'time_mlp_out.projector': 'time_mlp_out', 'action_in_proj.projector': 'action_in_proj', 'action_out_proj.projector': 'action_out_proj', 'llm_backbone.embed_tokens': 'paligemma_with_expert.paligemma.lm_head'}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': './datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter', 'use_conversation': False}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 
57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 32, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 24, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sharding_strategy': 'no-shard', 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'sampler': None, 'tokenizer': {'type': 'PaligemmaTokenizer'}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/pi05/libero_10', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {'model': {'type': 'PI05FlowMatching', 'llm_backbone': {'type': 'ConditionGemmaModel', 'adarms_cond_dim': None, 'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 16384, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'use_cache': True, 'vocab_size': 257152}, 'vision_backbone': {'type': 'SigLIPViTBackbone', 'vision_backbone_id': 'siglip_224', 'vision_config': {'attention_dropout': 0.0, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1152, 'image_size': 224, 'intermediate_size': 4304, 'layer_norm_eps': 1e-06, 'model_type': 'siglip_vision_model', 'num_attention_heads': 16, 
'num_channels': 3, 'num_hidden_layers': 27, 'patch_size': 14, 'projection_dim': 2048, 'projector_hidden_act': 'gelu_fast', 'torch_dtype': 'float32', 'vision_use_head': False}}, 'projector': {'type': 'LinearProjector', 'in_dim': 1152, 'out_dim': 2048}, 'proj_width': 1024, 'n_action_steps': 10, 'action_in_proj': {'type': 'LinearProjector', 'in_dim': 32, 'out_dim': 1024}, 'action_out_proj': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 32}, 'time_mlp_in': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'time_mlp_out': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'max_action_dim': 32, 'llm_expert': {'type': 'ConditionGemmaModel', 'attention_bias': False, 'adarms_cond_dim': 1024, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 4096, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'pad_token_id': 0, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'transformers_version': '4.48.1', 'use_adarms': True, 'use_cache': True, 'vocab_size': 257152}, 'freeze_llm_backbone': False, 'freeze_vision_backbone': False, 'pretrained_name_or_path': './checkpoints/pi05_libero/model.safetensors', 'name_mapping': {'llm_backbone': 'paligemma_with_expert.paligemma.model.language_model', 'vision_backbone.vision': 'paligemma_with_expert.paligemma.model.vision_tower', 'projector.projector': 'paligemma_with_expert.paligemma.model.multi_modal_projector.linear', 'llm_expert': 'paligemma_with_expert.gemma_expert.model', 'time_mlp_in.projector': 'time_mlp_in', 'time_mlp_out.projector': 'time_mlp_out', 'action_in_proj.projector': 'action_in_proj', 'action_out_proj.projector': 'action_out_proj', 'llm_backbone.embed_tokens': 'paligemma_with_expert.paligemma.lm_head'}}, 
'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': './datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter', 'use_conversation': False}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 32, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 24, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sharding_strategy': 'no-shard', 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'sampler': None, 'tokenizer': {'type': 'PaligemmaTokenizer'}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/pi05/libero_10', 
'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {...}, 'run_id': 'pi05_paligemma_libero_10_full_finetune_2026_05_15_09_15_10'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {...}, 'args': Namespace(config='configs/pi05/pi05_paligemma_libero_10_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/pi05/libero_10', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'LiberoPromptFromInputs', 'use_conversation': False, 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'LiberoProprioFromInputs', 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'state_dim': 32, 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std', 'action_dim': 7}}}, 'run_id': 'pi05_paligemma_libero_10_full_finetune_2026_05_15_09_15_10'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 
'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {'model': {'type': 'PI05FlowMatching', 'llm_backbone': {'type': 'ConditionGemmaModel', 'adarms_cond_dim': None, 'attention_bias': False, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 16384, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'use_cache': True, 'vocab_size': 257152}, 'vision_backbone': {'type': 'SigLIPViTBackbone', 'vision_backbone_id': 'siglip_224', 'vision_config': {'attention_dropout': 0.0, 'hidden_act': 'gelu_pytorch_tanh', 'hidden_size': 1152, 'image_size': 224, 'intermediate_size': 4304, 'layer_norm_eps': 1e-06, 'model_type': 'siglip_vision_model', 'num_attention_heads': 16, 'num_channels': 3, 'num_hidden_layers': 27, 'patch_size': 14, 'projection_dim': 2048, 'projector_hidden_act': 'gelu_fast', 'torch_dtype': 'float32', 'vision_use_head': False}}, 'projector': {'type': 'LinearProjector', 'in_dim': 1152, 'out_dim': 2048}, 'proj_width': 1024, 'n_action_steps': 10, 'action_in_proj': {'type': 'LinearProjector', 'in_dim': 32, 'out_dim': 1024}, 'action_out_proj': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 32}, 'time_mlp_in': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'time_mlp_out': {'type': 'LinearProjector', 'in_dim': 1024, 'out_dim': 1024}, 'max_action_dim': 32, 'llm_expert': {'type': 'ConditionGemmaModel', 'attention_bias': False, 'adarms_cond_dim': 1024, 'attention_dropout': 0.0, 'bos_token_id': 2, 'eos_token_id': 1, 'head_dim': 256, 'hidden_act': 'gelu_pytorch_tanh', 
'hidden_activation': 'gelu_pytorch_tanh', 'hidden_size': 1024, 'initializer_range': 0.02, 'intermediate_size': 4096, 'max_position_embeddings': 8192, 'model_type': 'gemma', 'num_attention_heads': 8, 'num_hidden_layers': 18, 'num_key_value_heads': 1, 'pad_token_id': 0, 'rms_norm_eps': 1e-06, 'rope_theta': 10000.0, 'torch_dtype': 'float32', 'transformers_version': '4.48.1', 'use_adarms': True, 'use_cache': True, 'vocab_size': 257152}, 'freeze_llm_backbone': False, 'freeze_vision_backbone': False, 'pretrained_name_or_path': './checkpoints/pi05_libero/model.safetensors', 'name_mapping': {'llm_backbone': 'paligemma_with_expert.paligemma.model.language_model', 'vision_backbone.vision': 'paligemma_with_expert.paligemma.model.vision_tower', 'projector.projector': 'paligemma_with_expert.paligemma.model.multi_modal_projector.linear', 'llm_expert': 'paligemma_with_expert.gemma_expert.model', 'time_mlp_in.projector': 'time_mlp_in', 'time_mlp_out.projector': 'time_mlp_out', 'action_in_proj.projector': 'action_in_proj', 'action_out_proj.projector': 'action_out_proj', 'llm_backbone.embed_tokens': 'paligemma_with_expert.paligemma.lm_head'}}, 'train_dataloader': {'per_device_batch_size': 8, 'per_device_num_workers': 4, 'dataset': {'type': 'DistributedRepeatingDataset', 'name_mappings': {'observation.state': ['proprio'], 'action': ['action']}, 'statistic_keys': ['observation.state', 'timestamp', 'action'], 'statistic_name': 'libero_10_no_noops', 'datasets': {'type': 'ParquetDataset', 'data_root_path': './datasets/libero_10_no_noops_lerobotv2.1', 'transforms': [{'type': 'ProcessParquetInputs', 'parquet_keys': ['observation.state', 'timestamp', 'actions', 'info', 'stats', 'action_masks'], 'video_keys': ['observation.images.image', 'observation.images.wrist_image'], 'name_mappings': {'observation.state': ['states'], 'actions': ['actions']}}, {'type': 'ParquetPrompter', 'use_conversation': False}, {'type': 'ProcessPrompts', 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 
'ResizeImages', 'height': 224, 'width': 224}, {'type': 'NormalizeImages', 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'NormalizeStatesAndActions', 'action_dim': 32, 'state_dim': 32, 'state_key': 'proprio', 'action_key': 'action', 'norm_type': 'mean_std'}], 'action_window_size': 10, 'action_key': 'action', 'use_delta': False, 'statistic_name': 'libero_10_no_noops', 'window_start_idx': 0}}}, 'runner': {'type': 'FSDPTrainRunner', 'max_epochs': 24, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'max_grad_norm': 1.0, 'sharding_strategy': 'no-shard', 'collator': {'type': 'DictCollator', 'keys': ['states', 'observation.eepose', 'timestamp', 'images', 'img_masks', 'lang_tokens', 'lang_masks', 'actions', 'action_masks'], 'meta_keys': ['task_description', 'prompt', 'info', 'stats']}, 'sampler': None, 'tokenizer': {'type': 'PaligemmaTokenizer'}, 'metric': {'type': 'VLAMetric', 'active_trackers': ['jsonl', 'tensorboard'], 'run_dir': '/limx/tos/users/jikun/wk_dir/pi05/libero_10', 'grad_accumulation_steps': 1, 'window_size': 1, 'hparams': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {...}, 'run_id': 'pi05_paligemma_libero_10_full_finetune_2026_05_15_09_15_10'}, 'lr_scheduler_type': 'linear-warmup+cosine-decay', 'warmup_ratio': 0.03, 'enable_gradient_checkpointing': True, 'enable_mixed_precision_training': True, 'mixed_precision_dtype': 'bf16', 'change_key_name': False, 'cfg': Config (path: configs/pi05/pi05_paligemma_libero_10_full_finetune.py): {...}, 'args': Namespace(config='configs/pi05/pi05_paligemma_libero_10_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/pi05/libero_10', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 
10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'LiberoPromptFromInputs', 'use_conversation': False, 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 'LiberoProprioFromInputs', 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'state_dim': 32, 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std', 'action_dim': 7}}}, 'args': Namespace(config='configs/pi05/pi05_paligemma_libero_10_full_finetune.py', work_dir='/limx/tos/users/jikun/wk_dir/pi05/libero_10', cfg_options={'runner.metric.active_trackers': ['jsonl', 'tensorboard']}, eval_after_train=True, resume_from=None)}, 'eval': {'type': 'LiberoEvalRunner', 'task_suite_name': 'libero_10', 'model_family': 'pi0', 'eval_chunk_size': 10, 'resize_size': 224, 'num_trials_per_task': 50, 'num_steps_wait': 10, 'seed': 7, 'dataset': {'type': 'LiberoParquetEvalDataset', 'transforms': [{'type': 'ProcessLiberoEvalInputs', 'img_keys': ['agentview_image', 'robot0_eye_in_hand_image']}, {'type': 'TransformImage', 'image_resize_strategy': 'resize-naive', 'input_sizes': [[3, 224, 224], [3, 224, 224]], 'means': [[123.515625, 116.04492188, 103.59375], [123.515625, 116.04492188, 103.59375]], 'stds': [[58.27148438, 57.02636719, 57.27539062], [58.27148438, 57.02636719, 57.27539062]]}, {'type': 'LiberoPromptFromInputs', 'use_conversation': False, 'tokenizer': {'type': 'PaligemmaTokenizer'}}, {'type': 
'LiberoProprioFromInputs', 'norm_type': 'mean_std', 'pos_key': 'robot0_eef_pos', 'quat_key': 'robot0_eef_quat', 'gripper_key': 'robot0_gripper_qpos', 'state_dim': 32, 'out_key': 'states'}]}, 'denormalize_action': {'type': 'DenormalizeLiberoAction', 'norm_type': 'mean_std', 'action_dim': 7}}}", "run_id": "pi05_paligemma_libero_10_full_finetune_2026_05_15_09_15_10"}
pi05_paligemma_libero_goal_full_finetune_bs64/checkpoints/step-019848-epoch-24-loss=0.0145.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54ef053d4c29540dd2ec21154716b433c05333173528b3c0d2abd740835d42f9
3
+ size 7233625688
pi05_paligemma_libero_goal_full_finetune_bs64/config.json ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval": {
3
+ "dataset": {
4
+ "transforms": [
5
+ {
6
+ "img_keys": [
7
+ "agentview_image",
8
+ "robot0_eye_in_hand_image"
9
+ ],
10
+ "type": "ProcessLiberoEvalInputs"
11
+ },
12
+ {
13
+ "image_resize_strategy": "resize-naive",
14
+ "input_sizes": [
15
+ [
16
+ 3,
17
+ 224,
18
+ 224
19
+ ],
20
+ [
21
+ 3,
22
+ 224,
23
+ 224
24
+ ]
25
+ ],
26
+ "means": [
27
+ [
28
+ 123.515625,
29
+ 116.04492188,
30
+ 103.59375
31
+ ],
32
+ [
33
+ 123.515625,
34
+ 116.04492188,
35
+ 103.59375
36
+ ]
37
+ ],
38
+ "stds": [
39
+ [
40
+ 58.27148438,
41
+ 57.02636719,
42
+ 57.27539062
43
+ ],
44
+ [
45
+ 58.27148438,
46
+ 57.02636719,
47
+ 57.27539062
48
+ ]
49
+ ],
50
+ "type": "TransformImage"
51
+ },
52
+ {
53
+ "tokenizer": {
54
+ "type": "PaligemmaTokenizer"
55
+ },
56
+ "type": "LiberoPromptFromInputs",
57
+ "use_conversation": false
58
+ },
59
+ {
60
+ "gripper_key": "robot0_gripper_qpos",
61
+ "norm_type": "mean_std",
62
+ "out_key": "states",
63
+ "pos_key": "robot0_eef_pos",
64
+ "quat_key": "robot0_eef_quat",
65
+ "state_dim": 32,
66
+ "type": "LiberoProprioFromInputs"
67
+ }
68
+ ],
69
+ "type": "LiberoParquetEvalDataset"
70
+ },
71
+ "denormalize_action": {
72
+ "action_dim": 7,
73
+ "norm_type": "mean_std",
74
+ "type": "DenormalizeLiberoAction"
75
+ },
76
+ "eval_chunk_size": 10,
77
+ "model_family": "pi0",
78
+ "num_steps_wait": 10,
79
+ "num_trials_per_task": 50,
80
+ "resize_size": 224,
81
+ "seed": 7,
82
+ "task_suite_name": "libero_goal",
83
+ "type": "LiberoEvalRunner"
84
+ },
85
+ "model": {
86
+ "action_in_proj": {
87
+ "in_dim": 32,
88
+ "out_dim": 1024,
89
+ "type": "LinearProjector"
90
+ },
91
+ "action_out_proj": {
92
+ "in_dim": 1024,
93
+ "out_dim": 32,
94
+ "type": "LinearProjector"
95
+ },
96
+ "freeze_llm_backbone": false,
97
+ "freeze_vision_backbone": false,
98
+ "llm_backbone": {
99
+ "adarms_cond_dim": null,
100
+ "attention_bias": false,
101
+ "attention_dropout": 0.0,
102
+ "bos_token_id": 2,
103
+ "eos_token_id": 1,
104
+ "head_dim": 256,
105
+ "hidden_act": "gelu_pytorch_tanh",
106
+ "hidden_activation": "gelu_pytorch_tanh",
107
+ "hidden_size": 2048,
108
+ "initializer_range": 0.02,
109
+ "intermediate_size": 16384,
110
+ "max_position_embeddings": 8192,
111
+ "model_type": "gemma",
112
+ "num_attention_heads": 8,
113
+ "num_hidden_layers": 18,
114
+ "num_key_value_heads": 1,
115
+ "rms_norm_eps": 1e-06,
116
+ "rope_theta": 10000.0,
117
+ "torch_dtype": "float32",
118
+ "type": "ConditionGemmaModel",
119
+ "use_cache": true,
120
+ "vocab_size": 257152
121
+ },
122
+ "llm_expert": {
123
+ "adarms_cond_dim": 1024,
124
+ "attention_bias": false,
125
+ "attention_dropout": 0.0,
126
+ "bos_token_id": 2,
127
+ "eos_token_id": 1,
128
+ "head_dim": 256,
129
+ "hidden_act": "gelu_pytorch_tanh",
130
+ "hidden_activation": "gelu_pytorch_tanh",
131
+ "hidden_size": 1024,
132
+ "initializer_range": 0.02,
133
+ "intermediate_size": 4096,
134
+ "max_position_embeddings": 8192,
135
+ "model_type": "gemma",
136
+ "num_attention_heads": 8,
137
+ "num_hidden_layers": 18,
138
+ "num_key_value_heads": 1,
139
+ "pad_token_id": 0,
140
+ "rms_norm_eps": 1e-06,
141
+ "rope_theta": 10000.0,
142
+ "torch_dtype": "float32",
143
+ "transformers_version": "4.48.1",
144
+ "type": "ConditionGemmaModel",
145
+ "use_adarms": true,
146
+ "use_cache": true,
147
+ "vocab_size": 257152
148
+ },
149
+ "max_action_dim": 32,
150
+ "n_action_steps": 10,
151
+ "name_mapping": {
152
+ "action_in_proj.projector": "action_in_proj",
153
+ "action_out_proj.projector": "action_out_proj",
154
+ "llm_backbone": "paligemma_with_expert.paligemma.model.language_model",
155
+ "llm_backbone.embed_tokens": "paligemma_with_expert.paligemma.lm_head",
156
+ "llm_expert": "paligemma_with_expert.gemma_expert.model",
157
+ "projector.projector": "paligemma_with_expert.paligemma.model.multi_modal_projector.linear",
158
+ "time_mlp_in.projector": "time_mlp_in",
159
+ "time_mlp_out.projector": "time_mlp_out",
160
+ "vision_backbone.vision": "paligemma_with_expert.paligemma.model.vision_tower"
161
+ },
162
+ "pretrained_name_or_path": "./checkpoints/pi05_libero/model.safetensors",
163
+ "proj_width": 1024,
164
+ "projector": {
165
+ "in_dim": 1152,
166
+ "out_dim": 2048,
167
+ "type": "LinearProjector"
168
+ },
169
+ "time_mlp_in": {
170
+ "in_dim": 1024,
171
+ "out_dim": 1024,
172
+ "type": "LinearProjector"
173
+ },
174
+ "time_mlp_out": {
175
+ "in_dim": 1024,
176
+ "out_dim": 1024,
177
+ "type": "LinearProjector"
178
+ },
179
+ "type": "PI05FlowMatching",
180
+ "vision_backbone": {
181
+ "type": "SigLIPViTBackbone",
182
+ "vision_backbone_id": "siglip_224",
183
+ "vision_config": {
184
+ "attention_dropout": 0.0,
185
+ "hidden_act": "gelu_pytorch_tanh",
186
+ "hidden_size": 1152,
187
+ "image_size": 224,
188
+ "intermediate_size": 4304,
189
+ "layer_norm_eps": 1e-06,
190
+ "model_type": "siglip_vision_model",
191
+ "num_attention_heads": 16,
192
+ "num_channels": 3,
193
+ "num_hidden_layers": 27,
194
+ "patch_size": 14,
195
+ "projection_dim": 2048,
196
+ "projector_hidden_act": "gelu_fast",
197
+ "torch_dtype": "float32",
198
+ "vision_use_head": false
199
+ }
200
+ }
201
+ },
202
+ "runner": {
203
+ "change_key_name": false,
204
+ "collator": {
205
+ "keys": [
206
+ "states",
207
+ "observation.eepose",
208
+ "timestamp",
209
+ "images",
210
+ "img_masks",
211
+ "lang_tokens",
212
+ "lang_masks",
213
+ "actions",
214
+ "action_masks"
215
+ ],
216
+ "meta_keys": [
217
+ "task_description",
218
+ "prompt",
219
+ "info",
220
+ "stats"
221
+ ],
222
+ "type": "DictCollator"
223
+ },
224
+ "enable_gradient_checkpointing": true,
225
+ "enable_mixed_precision_training": true,
226
+ "learning_rate": 5e-05,
227
+ "lr_scheduler_type": "linear-warmup+cosine-decay",
228
+ "max_epochs": 24,
229
+ "max_grad_norm": 1.0,
230
+ "metric": {
231
+ "active_trackers": [
232
+ "jsonl",
233
+ "tensorboard"
234
+ ],
235
+ "grad_accumulation_steps": 1,
236
+ "run_dir": "work_dirs",
237
+ "type": "VLAMetric",
238
+ "window_size": 1
239
+ },
240
+ "mixed_precision_dtype": "bf16",
241
+ "sampler": null,
242
+ "sharding_strategy": "no-shard",
243
+ "tokenizer": {
244
+ "type": "PaligemmaTokenizer"
245
+ },
246
+ "type": "FSDPTrainRunner",
247
+ "warmup_ratio": 0.03,
248
+ "weight_decay": 0.0
249
+ },
250
+ "train_dataloader": {
251
+ "dataset": {
252
+ "datasets": {
253
+ "action_key": "action",
254
+ "action_window_size": 10,
255
+ "data_root_path": "./datasets/libero_goal_no_noops_lerobotv2.1",
256
+ "statistic_name": "libero_goal_no_noops",
257
+ "transforms": [
258
+ {
259
+ "name_mappings": {
260
+ "actions": [
261
+ "actions"
262
+ ],
263
+ "observation.state": [
264
+ "states"
265
+ ]
266
+ },
267
+ "parquet_keys": [
268
+ "observation.state",
269
+ "timestamp",
270
+ "actions",
271
+ "info",
272
+ "stats",
273
+ "action_masks"
274
+ ],
275
+ "type": "ProcessParquetInputs",
276
+ "video_keys": [
277
+ "observation.images.image",
278
+ "observation.images.wrist_image"
279
+ ]
280
+ },
281
+ {
282
+ "type": "ParquetPrompter",
283
+ "use_conversation": false
284
+ },
285
+ {
286
+ "tokenizer": {
287
+ "type": "PaligemmaTokenizer"
288
+ },
289
+ "type": "ProcessPrompts"
290
+ },
291
+ {
292
+ "height": 224,
293
+ "type": "ResizeImages",
294
+ "width": 224
295
+ },
296
+ {
297
+ "means": [
298
+ [
299
+ 123.515625,
300
+ 116.04492188,
301
+ 103.59375
302
+ ],
303
+ [
304
+ 123.515625,
305
+ 116.04492188,
306
+ 103.59375
307
+ ]
308
+ ],
309
+ "stds": [
310
+ [
311
+ 58.27148438,
312
+ 57.02636719,
313
+ 57.27539062
314
+ ],
315
+ [
316
+ 58.27148438,
317
+ 57.02636719,
318
+ 57.27539062
319
+ ]
320
+ ],
321
+ "type": "NormalizeImages"
322
+ },
323
+ {
324
+ "action_dim": 32,
325
+ "action_key": "action",
326
+ "norm_type": "mean_std",
327
+ "state_dim": 32,
328
+ "state_key": "proprio",
329
+ "type": "NormalizeStatesAndActions"
330
+ }
331
+ ],
332
+ "type": "ParquetDataset",
333
+ "use_delta": false,
334
+ "window_start_idx": 0
335
+ },
336
+ "name_mappings": {
337
+ "action": [
338
+ "action"
339
+ ],
340
+ "observation.state": [
341
+ "proprio"
342
+ ]
343
+ },
344
+ "statistic_keys": [
345
+ "observation.state",
346
+ "timestamp",
347
+ "action"
348
+ ],
349
+ "statistic_name": "libero_goal_no_noops",
350
+ "type": "DistributedRepeatingDataset"
351
+ },
352
+ "per_device_batch_size": 8,
353
+ "per_device_num_workers": 4
354
+ }
355
+ }