-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtrain-lerobot-standalone.sh
More file actions
executable file
·227 lines (197 loc) · 6.55 KB
/
train-lerobot-standalone.sh
File metadata and controls
executable file
·227 lines (197 loc) · 6.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/bin/bash
#
# SmolVLA Standalone Training Script
# Launches standalone LeRobot training through conda (default) or Docker.

# Stop at the first command that returns a non-zero status.
set -e

# Defaults. STEPS, MODE and DOCKER_IMAGE keep any non-empty value that is
# already present in the environment; otherwise the value below is assigned.
: "${STEPS:=2}"
: "${MODE:=conda}"        # 'conda' or 'docker'
: "${DOCKER_IMAGE:=zk0}"
DATASET_REPO_ID=""        # no default: must be provided by the user

# Colour codes for log output, kept as literal backslash sequences that the
# print_* helpers expand when they write a message.
NC='\033[0m' # No Color (reset)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
# Function to print colored output
# Write an informational message to stdout, prefixed with a [INFO] tag
# wrapped in the BLUE / NC colour codes.
#   $1 - message text (backslash escapes in it are expanded as well)
print_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Write a success message to stdout, prefixed with a [SUCCESS] tag wrapped
# in the GREEN / NC colour codes.
#   $1 - message text (backslash escapes in it are expanded as well)
print_success() {
  local tag="${GREEN}[SUCCESS]${NC}"
  printf '%b\n' "${tag} $1"
}
# Write a warning message to stdout, prefixed with a [WARNING] tag wrapped
# in the YELLOW / NC colour codes.
#   $1 - message text (backslash escapes in it are expanded as well)
print_warning() {
  local msg=$1
  printf '%b\n' "${YELLOW}[WARNING]${NC} ${msg}"
}
# Write an error message to stdout, prefixed with a [ERROR] tag wrapped in
# the RED / NC colour codes.
#   $1 - message text (backslash escapes in it are expanded as well)
print_error() {
  local prefix="${RED}[ERROR]${NC}"
  local text=$1
  printf '%b\n' "${prefix} ${text}"
}
# Function to show usage
#
# Prints the help text to stdout. Takes no arguments and does not exit;
# callers decide the exit status.
#
# The text states the step default the script actually applies (2) and every
# example passes the required -d/--dataset option, so each example is runnable.
usage() {
  cat <<EOF
Usage: $0 [OPTIONS]

Run SmolVLA standalone training using conda (default) or Docker

Options:
  -d, --dataset REPO   Hugging Face dataset repo ID (required)
  -s, --steps NUM      Number of training steps (default: 2)
  -m, --mode MODE      Execution mode: 'conda' or 'docker' (default: conda)
  -i, --image NAME     Docker image name (default: zk0, used only with --mode docker)
  -h, --help           Show this help message

Environment Variables:
  STEPS=2              Number of training steps
  MODE=docker          Use Docker instead of conda
  DOCKER_IMAGE=zk0     Docker image to use

Examples:
  $0 -d REPO                              # Run with defaults (conda mode, 2 steps)
  $0 -d REPO -s 1000                      # Run 1000 steps with conda
  $0 -d REPO -m docker                    # Use Docker instead of conda
  $0 -d REPO -m docker -i custom-image    # Use custom Docker image
EOF
}
# Parse command line arguments
#
# parse_args ARGS...
#   Sets the globals DATASET_REPO_ID, STEPS, MODE and DOCKER_IMAGE from the
#   given options. -h/--help prints the usage text and exits 0. An unknown
#   option, or a value-taking option without a value, reports an error, prints
#   the usage text and exits 1.
#
# The value check matters: without it `shift 2` fails when the value is
# missing, and with `set -e` in effect the script would stop without printing
# any message.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)
        usage
        exit 0
        ;;
      -d|--dataset|-s|--steps|-m|--mode|-i|--image)
        if [[ $# -lt 2 ]]; then
          print_error "Option $1 requires an argument"
          usage
          exit 1
        fi
        case "$1" in
          -d|--dataset) DATASET_REPO_ID="$2" ;;
          -s|--steps)   STEPS="$2" ;;
          -m|--mode)    MODE="$2" ;;
          -i|--image)   DOCKER_IMAGE="$2" ;;
        esac
        shift 2
        ;;
      *)
        print_error "Unknown option: $1"
        usage
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Validate inputs

# The dataset repo ID has no default, so an empty value is rejected and the
# usage text is shown.
[[ -n "$DATASET_REPO_ID" ]] || {
  print_error "Dataset repo ID is required. Use -d or --dataset"
  usage
  exit 1
}

# STEPS is accepted only when it consists solely of digits and is not less
# than 1; the numeric comparison is skipped when the digit check fails.
if [[ "$STEPS" =~ ^[0-9]+$ ]] && ! [ "$STEPS" -lt 1 ]; then
  : # valid, nothing to do
else
  print_error "Steps must be a positive integer"
  exit 1
fi

# Only the two known execution modes are allowed.
case "$MODE" in
  conda|docker)
    ;;
  *)
    print_error "Mode must be either 'conda' or 'docker'"
    exit 1
    ;;
esac
# Report the effective run configuration, one log line per setting.
for banner_line in \
  "Starting SmolVLA Standalone Training" \
  "====================================" \
  "Dataset: $DATASET_REPO_ID" \
  "Steps: $STEPS" \
  "Mode: $MODE"; do
  print_info "$banner_line"
done
unset banner_line

# The image name is only relevant in docker mode.
case "$MODE" in
  docker)
    print_info "Docker Image: $DOCKER_IMAGE"
    ;;
esac
print_info ""

# Derive the output directory name from the current time, the dataset ID
# (with every '/' replaced by '_') and the step count.
TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
DATASET_SAFE=$(echo "$DATASET_REPO_ID" | tr '/' '_')
printf -v OUTPUT_DIR 'outputs/train/%s_%s_%ssteps' \
  "$TIMESTAMP" "$DATASET_SAFE" "$STEPS"
print_info "Output dir: $OUTPUT_DIR"
# Function to execute conda training
#
# Runs the LeRobot training module inside the 'zk0' conda environment.
# Globals read: DATASET_REPO_ID, STEPS, OUTPUT_DIR
# Exits with status 1 when conda or the 'zk0' environment is missing;
# otherwise returns the exit status of the `conda run` training command.
execute_conda_training() {
  local env_name="zk0"
  local device

  print_info "Using conda environment: ${env_name}"

  # Check if conda is available
  if ! command -v conda &> /dev/null; then
    print_error "Conda is not installed or not in PATH"
    exit 1
  fi

  # Check if the zk0 environment exists. The first column of each listed line
  # is compared exactly; a substring match would also accept environments
  # such as 'zk0-old' or any path that merely contains 'zk0'.
  if ! conda info --envs \
    | awk -v env="$env_name" '$1 == env { found = 1 } END { exit !found }'; then
    print_error "Conda environment 'zk0' not found"
    print_error "Please create it first with: conda create -n zk0 python=3.10"
    exit 1
  fi

  # Pick the device that matches what was detected, so the CPU warning below
  # is true instead of being followed by a request for cuda.
  if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
    print_info "GPU detected and available"
    device="cuda"
  else
    print_warning "GPU not detected. Training will use CPU."
    device="cpu"
  fi

  # Execute the training command
  conda run -n "$env_name" python -m lerobot.scripts.train \
    --policy.path=lerobot/smolvla_base \
    --dataset.repo_id="$DATASET_REPO_ID" \
    --batch_size=64 \
    --steps="$STEPS" \
    --output_dir="$OUTPUT_DIR" \
    --job_name=my_smolvla_training \
    --policy.device="$device" --policy.push_to_hub=false
}
# Function to execute docker training
#
# Runs the LeRobot training module inside the Docker image DOCKER_IMAGE.
# Globals read: DOCKER_IMAGE, DATASET_REPO_ID, STEPS, OUTPUT_DIR, HOME
# Exits with status 1 when the docker CLI is missing or `docker info` fails;
# otherwise returns the exit status of the training `docker run`.
#
# The command is assembled as an array and executed directly (no eval), so
# paths and values containing spaces or quotes reach docker unchanged. The
# training options are handed to the in-container shell as positional
# parameters ("$@") for the same reason.
execute_docker_training() {
  local device="cuda"
  local host_dir display
  local -a gpu_args=(--gpus all)
  local -a docker_cmd

  print_info "Using Docker image: $DOCKER_IMAGE"

  # Check if Docker is available
  if ! command -v docker &> /dev/null; then
    print_error "Docker is not installed or not in PATH"
    exit 1
  fi

  # Check if Docker daemon is running
  if ! docker info &> /dev/null; then
    print_error "Docker daemon is not running"
    exit 1
  fi

  # Check if GPU support is available. When the probe fails, drop the GPU
  # request and train on the CPU so the warning matches what is executed.
  if ! docker run --rm --gpus all "$DOCKER_IMAGE" nvidia-smi &> /dev/null; then
    print_warning "GPU support not available in Docker. Training will use CPU."
    gpu_args=()
    device="cpu"
  fi

  host_dir=$(pwd)

  # Build the Docker command
  docker_cmd=(
    docker run "${gpu_args[@]}" --shm-size=10.24gb
    -v "${host_dir}:/workspace"
    -v "${host_dir}/outputs:/workspace/outputs"
    -v /tmp:/tmp
    # Mount Hugging Face cache directory for model persistence
    -v "${HOME}/.cache/huggingface:/home/user_lerobot/.cache/huggingface"
    -w /workspace
    "$DOCKER_IMAGE"
    sh -c 'uv pip install --no-cache-dir --no-build-isolation -r requirements.txt && PYTHONPATH=/workspace python -m lerobot.scripts.train "$@"' sh
    --policy.path=lerobot/smolvla_base
    --dataset.repo_id="$DATASET_REPO_ID"
    --batch_size=64
    --steps="$STEPS"
    --output_dir="$OUTPUT_DIR"
    --job_name=my_smolvla_training
    # Same option name as the conda path uses for disabling the hub upload.
    --policy.device="$device" --policy.push_to_hub=false
  )

  # Log a shell-quoted rendering of the exact argument vector.
  printf -v display '%q ' "${docker_cmd[@]}"
  print_info "Executing Docker command:"
  print_info "${display% }"
  print_info ""

  # Execute the command
  "${docker_cmd[@]}"
}
# Set Hugging Face verbosity for cache hit visibility
export HF_HUB_VERBOSITY=info
export HF_DATASETS_VERBOSITY=info
export TRANSFORMERS_VERBOSITY=info

# run_training
#   Dispatches to the training function selected by MODE and reports the
#   outcome. Returns 0 on success; on failure prints an error and exits with
#   the status of the training function.
#
# The status is captured with `|| status=$?` because, with `set -e` active, a
# failing call would otherwise end the script before the failure message could
# be printed. (Calling a function on the left of `||` also suspends `set -e`
# inside it; the training functions exit explicitly on their own checks.)
run_training() {
  local status=0

  # Execute based on mode
  case "$MODE" in
    conda)
      execute_conda_training || status=$?
      ;;
    docker)
      execute_docker_training || status=$?
      ;;
  esac

  # Check exit status
  if [[ "$status" -ne 0 ]]; then
    print_error "SmolVLA standalone training failed!"
    exit "$status"
  fi

  print_success "SmolVLA standalone training completed successfully!"
  # Point at the directory that was actually passed to the trainer.
  print_info "Check the '${OUTPUT_DIR}' directory for results and logs."
}

run_training