zk0/train-lerobot-standalone.sh at main · ivelin/zk0 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/bin/bash

# SmolVLA Standalone Training Script
# This script runs standalone LeRobot training using conda (default) or Docker for reproducible execution

set -o errexit  # Exit on any error

# Default values: STEPS, MODE and DOCKER_IMAGE keep whatever the environment
# already provides and only fall back to the built-in default when unset/empty.
: "${STEPS:=2}"
: "${MODE:=conda}"  # Default to conda, can be set to 'docker'
: "${DOCKER_IMAGE:=zk0}"
DATASET_REPO_ID=""  # Must be provided by user

# Colors for output (escape sequences are interpreted when printed)
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Function to print colored output
# print_info MESSAGE
#   Writes MESSAGE to stdout behind a "[INFO]" tag wrapped in the BLUE/NC
#   color codes. Backslash escapes in the line are interpreted.
print_info() {
    local message=$1
    printf '%b\n' "${BLUE}[INFO]${NC} ${message}"
}

# print_success MESSAGE
#   Writes MESSAGE to stdout behind a "[SUCCESS]" tag wrapped in the GREEN/NC
#   color codes. Backslash escapes in the line are interpreted.
print_success() {
    local message=$1
    printf '%b\n' "${GREEN}[SUCCESS]${NC} ${message}"
}

# print_warning MESSAGE
#   Writes MESSAGE behind a "[WARNING]" tag wrapped in the YELLOW/NC color
#   codes. The line goes to stderr so warnings stay visible (and separable)
#   when the script's stdout is piped or captured.
print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}

# print_error MESSAGE
#   Writes MESSAGE behind an "[ERROR]" tag wrapped in the RED/NC color codes.
#   The line goes to stderr, so error text never mixes into captured stdout.
print_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}

# Function to show usage
# Prints the help text to stdout. The documented defaults mirror the values
# assigned in the "Default values" section of this script, and every example
# passes the mandatory --dataset option so it can be copy-pasted as is.
usage() {
    cat <<EOF
Usage: $0 [OPTIONS]

Run SmolVLA standalone training using conda (default) or Docker

Options:
  -d, --dataset REPO     Hugging Face dataset repo ID (required)
  -s, --steps NUM        Number of training steps (default: 2)
  -m, --mode MODE        Execution mode: 'conda' or 'docker' (default: conda)
  -i, --image NAME       Docker image name (default: zk0, used only with --mode docker)
  -h, --help            Show this help message

Environment Variables:
  STEPS=NUM             Number of training steps when -s is not given
  MODE=docker           Use Docker instead of conda
  DOCKER_IMAGE=zk0      Docker image to use

Examples:
  $0 -d user/dataset                            # Run with defaults (conda mode, 2 steps)
  $0 -d user/dataset -s 1000                    # Run 1000 steps with conda
  $0 -d user/dataset -m docker                  # Use Docker instead of conda
  $0 -d user/dataset -m docker -i custom-image  # Use custom Docker image
EOF
}

# Parse command line arguments
#
# parse_args ARGS...
#   Walks the option list and stores the values in the globals
#   DATASET_REPO_ID, STEPS, MODE and DOCKER_IMAGE.
#   -h/--help prints the usage text and exits 0.
#   An unknown option, or a value-taking option given without a value,
#   prints an error plus the usage text and exits 1.
parse_args() {
    while [[ $# -gt 0 ]]; do
        # A value-taking option must be followed by its value. Without this
        # guard, 'shift 2' fails on a trailing option and 'set -e' aborts the
        # script without printing anything.
        case "$1" in
            -d|--dataset|-s|--steps|-m|--mode|-i|--image)
                if [[ $# -lt 2 ]]; then
                    print_error "Option $1 requires a value"
                    usage
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            -d|--dataset)
                DATASET_REPO_ID="$2"
                shift 2
                ;;
            -s|--steps)
                STEPS="$2"
                shift 2
                ;;
            -m|--mode)
                MODE="$2"
                shift 2
                ;;
            -i|--image)
                DOCKER_IMAGE="$2"
                shift 2
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                print_error "Unknown option: $1"
                usage
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Validate inputs: each guard reports the problem and exits 1 on failure.

# A dataset repo ID must have been supplied.
[[ -n "$DATASET_REPO_ID" ]] || {
    print_error "Dataset repo ID is required. Use -d or --dataset"
    usage
    exit 1
}

# STEPS must consist of digits only and be at least 1.
if [[ ! "$STEPS" =~ ^[0-9]+$ ]] || [ "$STEPS" -lt 1 ]; then
    print_error "Steps must be a positive integer"
    exit 1
fi

# Only the two known execution modes are accepted.
case "$MODE" in
    conda|docker)
        ;;
    *)
        print_error "Mode must be either 'conda' or 'docker'"
        exit 1
        ;;
esac

# Announce the run configuration.
print_info "Starting SmolVLA Standalone Training"
print_info "===================================="
print_info "Dataset: $DATASET_REPO_ID"
print_info "Steps: $STEPS"
print_info "Mode: $MODE"
if [[ "$MODE" == "docker" ]]; then
    print_info "Docker Image: $DOCKER_IMAGE"
fi
print_info ""

# Derive a unique output directory for this run:
#   outputs/train/<timestamp>_<dataset with '/' replaced by '_'>_<steps>steps
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
# Parameter expansion instead of 'echo | tr': no extra processes, and the
# value is never interpreted as an echo option.
DATASET_SAFE=${DATASET_REPO_ID//\//_}
OUTPUT_DIR="outputs/train/${TIMESTAMP}_${DATASET_SAFE}_${STEPS}steps"
print_info "Output dir: $OUTPUT_DIR"

# Function to execute conda training
#
# Runs LeRobot training inside the 'zk0' conda environment.
# Globals read: DATASET_REPO_ID, STEPS, OUTPUT_DIR
# Exits 1 when conda or the 'zk0' environment is missing; otherwise returns
# the exit status of the training command.
execute_conda_training() {
    local device

    print_info "Using conda environment: zk0"

    # Check if conda is available
    if ! command -v conda &> /dev/null; then
        print_error "Conda is not installed or not in PATH"
        exit 1
    fi

    # Check if the zk0 environment exists. The name column is compared
    # exactly: a substring match would also accept e.g. 'zk0-dev', or any
    # environment whose install path merely contains 'zk0'.
    if ! conda info --envs | awk '$1 == "zk0" { found = 1 } END { exit !found }'; then
        print_error "Conda environment 'zk0' not found"
        print_error "Please create it first with: conda create -n zk0 python=3.10"
        exit 1
    fi

    # Check if GPU is available and pick the matching device, so that the
    # CPU fallback announced below is what training is actually asked to use.
    if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then
        print_info "GPU detected and available"
        device=cuda
    else
        print_warning "GPU not detected. Training will use CPU."
        device=cpu
    fi

    # Execute the training command
    conda run -n zk0 python -m lerobot.scripts.train \
      --policy.path=lerobot/smolvla_base \
      --dataset.repo_id="$DATASET_REPO_ID" \
      --batch_size=64 \
      --steps="$STEPS" \
      --output_dir="$OUTPUT_DIR" \
      --job_name=my_smolvla_training \
      --policy.device="$device" --policy.push_to_hub=false
}

# Function to execute docker training
#
# Runs LeRobot training inside a container started from $DOCKER_IMAGE.
# Globals read: DOCKER_IMAGE, DATASET_REPO_ID, STEPS, OUTPUT_DIR, HOME
# Exits 1 when Docker is missing or its daemon is not reachable; otherwise
# returns the exit status of the 'docker run' training command.
execute_docker_training() {
    local device host_dir cmd_str
    local -a gpu_args train_args docker_cmd

    print_info "Using Docker image: $DOCKER_IMAGE"

    # Check if Docker is available
    if ! command -v docker &> /dev/null; then
        print_error "Docker is not installed or not in PATH"
        exit 1
    fi

    # Check if Docker daemon is running
    if ! docker info &> /dev/null; then
        print_error "Docker daemon is not running"
        exit 1
    fi

    # Check if GPU support is available. When the probe fails, drop both the
    # GPU passthrough flag and the cuda device so the announced CPU fallback
    # is what actually gets requested.
    gpu_args=(--gpus all)
    device=cuda
    if ! docker run --rm --gpus all "$DOCKER_IMAGE" nvidia-smi &> /dev/null; then
        print_warning "GPU support not available in Docker. Training will use CPU."
        gpu_args=()
        device=cpu
    fi

    host_dir=$(pwd)

    # Training arguments; same flags as the conda code path
    # (note --policy.push_to_hub, not --push_to_hub).
    train_args=(
        --policy.path=lerobot/smolvla_base
        "--dataset.repo_id=$DATASET_REPO_ID"
        --batch_size=64
        "--steps=$STEPS"
        "--output_dir=$OUTPUT_DIR"
        --job_name=my_smolvla_training
        "--policy.device=$device"
        --policy.push_to_hub=false
    )

    # Build the Docker command as an array so that paths and values containing
    # spaces or shell metacharacters are passed through verbatim (no eval).
    # The training arguments reach the in-container shell as positional
    # parameters ("$@") instead of being spliced into the script text.
    docker_cmd=(
        docker run "${gpu_args[@]}" --shm-size=10.24gb
        -v "$host_dir:/workspace"
        -v "$host_dir/outputs:/workspace/outputs"
        -v /tmp:/tmp
        # Mount Hugging Face cache directory for model persistence
        -v "$HOME/.cache/huggingface:/home/user_lerobot/.cache/huggingface"
        -w /workspace
        "$DOCKER_IMAGE"
        sh -c 'uv pip install --no-cache-dir --no-build-isolation -r requirements.txt && PYTHONPATH=/workspace python -m lerobot.scripts.train "$@"'
        sh
        "${train_args[@]}"
    )

    # Log a shell-quoted rendering of the command. It is printed with a plain
    # printf so the quoting backslashes are shown exactly as generated.
    printf -v cmd_str '%q ' "${docker_cmd[@]}"
    print_info "Executing Docker command:"
    printf '%s\n' "$cmd_str"
    print_info ""

    # Execute the command
    "${docker_cmd[@]}"
}

# Set Hugging Face verbosity for cache hit visibility
export HF_HUB_VERBOSITY=info
export HF_DATASETS_VERBOSITY=info
export TRANSFORMERS_VERBOSITY=info

# Execute based on mode.
# The status is captured with '|| train_status=$?' because, under 'set -e', a
# failing training command would otherwise terminate the script right here
# and the failure message below could never be printed.
train_status=0
case "$MODE" in
    conda)
        execute_conda_training || train_status=$?
        ;;
    docker)
        execute_docker_training || train_status=$?
        ;;
    *)
        print_error "Mode must be either 'conda' or 'docker'"
        exit 1
        ;;
esac

# Check exit status
if [[ "$train_status" -eq 0 ]]; then
    print_success "SmolVLA standalone training completed successfully!"
    # Point at the directory this run was actually told to write to.
    print_info "Check the '$OUTPUT_DIR' directory for results and logs."
else
    print_error "SmolVLA standalone training failed!"
    exit "$train_status"
fi