diff --git a/Jenkinsfile.modified b/Jenkinsfile.modified new file mode 100644 index 0000000..3201c28 --- /dev/null +++ b/Jenkinsfile.modified @@ -0,0 +1,192 @@ +/* + * LSTM — Inference Evaluation Pipeline + */ +pipeline { + agent any + + environment { + BUILD_CTX = "${WORKSPACE}" + METRICS_DIR = "${WORKSPACE}/eval_metrics/lstm" + + MODEL_NAME = "lstm" + IMAGE_GPU = "py-lstm-gpu" + + CONTAINER_NAME = "eval-lstm" + EVAL_PORT = "8002" + + PATH = "/var/lib/jenkins/.local/bin:/home/ajitesh/.local/bin:${env.PATH}" + } + + stages { + stage('Setup') { + steps { + sh "mkdir -p ${METRICS_DIR}" + } + } + + stage('Build Image') { + steps { + script { + buildImg(IMAGE_GPU, 'python_ml/pytorch/LSTM/Inference/Dockerfile') + } + } + } + + stage('Image Metrics') { + steps { + script { + measureImg(IMAGE_GPU) + } + } + } + + stage('Evaluate') { + steps { + script { + evaluate(IMAGE_GPU, EVAL_PORT) + } + } + } + } + + post { + always { + sh "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true" + + archiveArtifacts artifacts: 'eval_metrics/lstm/**/*', allowEmptyArchive: true + } + } +} + +def buildImg(String image, String dockerfile) { + def t0 = System.currentTimeMillis() + def status = 'SUCCESS' + + try { + sh "docker build -t ${image} -f ${dockerfile} ${BUILD_CTX}" + } catch (Exception e) { + status = 'FAILURE' + throw e + } finally { + def dur = System.currentTimeMillis() - t0 + def ts = new Date().format("yyyy-MM-dd'T'HH:mm:ss'Z'") + + sh """ + cat > ${METRICS_DIR}/${image}_build.json </dev/null || echo 0) + LC=\$(docker history -q ${image} 2>/dev/null | wc -l) + SM=\$(echo "scale=2; \$SB / 1048576" | bc) + + cat > ${METRICS_DIR}/${image}_image.json </dev/null || true" + + sh """ + T0=\$(date +%s%3N) + + docker run -d \ + --name ${CONTAINER_NAME} \ + -p ${port}:8000 \ + -v ${METRICS_DIR}:/app/devops_metrics \ + ${image} + + T1=\$(date +%s%3N) + + echo "Container launched, waiting for readiness..." 
+ + READY=0 + + for i in \$(seq 1 300); do + HEALTH_STATUS=\$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' ${CONTAINER_NAME} 2>/dev/null || echo "missing") + + if [ "\$HEALTH_STATUS" = "healthy" ]; then + READY=1 + break + fi + + if curl -sf http://localhost:${port}/health > /dev/null 2>&1; then + READY=1 + break + fi + + sleep 0.1 + done + + T2=\$(date +%s%3N) + + CONTAINER_START_MS=\$((T1 - T0)) + APP_READY_MS=\$((T2 - T1)) + TOTAL_COLD_START_MS=\$((T2 - T0)) + + if [ "\$READY" -eq 1 ]; then + HEALTH_PAYLOAD=\$(curl -sf http://localhost:${port}/health 2>/dev/null || echo '{}') + + cat > ${METRICS_DIR}/cold_start.json < ${METRICS_DIR}/cold_start.json < ${METRICS_DIR}/app_metrics.json || echo '{}' > ${METRICS_DIR}/app_metrics.json + + docker stats --no-stream --format '{"dimension":"5_resources","cpu":"{{.CPUPerc}}","mem":"{{.MemUsage}}"}' ${CONTAINER_NAME} > ${METRICS_DIR}/docker_stats.json || true + + docker inspect ${CONTAINER_NAME} > ${METRICS_DIR}/container_inspect.json || true + """ + + sh "docker rm -f ${CONTAINER_NAME} 2>/dev/null || true" +} \ No newline at end of file diff --git a/latex_reports/draft_1.tex b/latex_reports/draft_1.tex new file mode 100644 index 0000000..042886f --- /dev/null +++ b/latex_reports/draft_1.tex @@ -0,0 +1,2235 @@ +\documentclass{ieeeaccess} +\usepackage{cite} +\usepackage{amsmath,amssymb,amsfonts} +\usepackage{algorithmic} +\usepackage{textcomp} + +\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em + T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}} + +% Encoding and fonts +\usepackage[T1]{fontenc} +\usepackage[utf8]{inputenc} +\usepackage{pdfpages} + +% Math and graphics +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{array} +\usepackage{float} + +% URLs (robust line breaking) +\usepackage{url} +\usepackage[hidelinks]{hyperref} +\def\UrlBreaks{\do/\do-\do_} + +% Code listings (stable for IEEE) +\usepackage{listings} +\usepackage{xcolor} +\lstset{ + 
basicstyle=\ttfamily\footnotesize, + breaklines=true, + breakatwhitespace=false, % <-- IMPORTANT (change this) + columns=fullflexible, + keepspaces=true, + showstringspaces=false +} + +% Custom Dockerfile language +\lstdefinelanguage{Dockerfile}{ + keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]" +} + +\begin{document} +\history{Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.} +\doi{10.1109/ACCESS.2017.DOI} + +\title{System-Level Evaluation of Rust and Python for Machine Learning} +\author{\uppercase{Project Elective}\authorrefmark{1}, +\IEEEmembership{Member, IEEE}} +\address[1]{Project Elective (e-mail: project@elective.com)} +\tfootnote{This paragraph of the first footnote will contain support +information, including sponsor and financial support acknowledgment. For +example, ``This work was supported in part by the U.S. Department of +Commerce under Grant BS123456.''} + +\markboth +{Project Elective \headeretal: System-Level Evaluation of Rust and Python for Machine Learning} +{Project Elective \headeretal: System-Level Evaluation of Rust and Python for Machine Learning} + +\corresp{Corresponding author: Project Elective (e-mail: project@elective.com).} + +\begin{abstract} +These instructions give you guidelines for preparing papers for +IEEE Access. Use this document as a template if you are +using \LaTeX. Otherwise, use this document as an +instruction set. The electronic file of your paper will be formatted further +at IEEE. Paper titles should be written in uppercase and lowercase letters, +not all uppercase. Avoid writing long formulas with subscripts in the title; +short formulas that identify the elements are fine (e.g., "Nd--Fe--B"). Do +not write ``(Invited)'' in the title. Full names of authors are preferred in +the author field, but are not required. 
Put a space between authors' +initials. The abstract must be a concise yet comprehensive reflection of +what is in your article. In particular, the abstract must be self-contained, +without abbreviations, footnotes, or references. It should be a microcosm of +the full article. The abstract must be between 150--250 words. Be sure that +you adhere to these limits; otherwise, you will need to edit your abstract +accordingly. The abstract must be written as one paragraph, and should not +contain displayed mathematical equations or tabular material. The abstract +should include three or four different keywords or phrases, as this will +help readers to find it. It is important to avoid over-repetition of such +phrases as this can result in a page being rejected by search engines. +Ensure that your abstract reads well and is grammatically correct. +\end{abstract} + +\begin{keywords} +Enter key words or phrases in alphabetical +order, separated by commas. For a list of suggested keywords, send a blank +e-mail to keywords@ieee.org or visit \underline +{http://www.ieee.org/organizations/pubs/ani\_prod/keywrd98.txt} +\end{keywords} + +\titlepgskip=-15pt + +\maketitle +\section{Overview of the Project} + +This project studies the use of \textbf{Rust} as an alternative systems language for machine learning workflows traditionally implemented in \textbf{Python}. +Rather than focusing on state-of-the-art model performance, the emphasis is on: + +\begin{itemize} + \item feasibility of end-to-end ML workflows, + \item system stability and reproducibility, + \item developer experience and DevOps complexity, + \item deployment and operational characteristics. +\end{itemize} + +To ensure clarity and rigor, the work is organized into \textbf{two clearly separated experimental tracks}. 
+ +\hrule + +\section{Project Structure: Two-Track Evaluation} + +The project consists of the following two tracks: + +\subsection*{Track 1: Training-Based Systems Evaluation} +This track compares \textbf{machine learning training pipelines} implemented in: +\begin{itemize} + \item PyTorch (Python), and + \item Burn (Rust). +\end{itemize} + +The goal is to evaluate training feasibility, stability, compile-time guarantees, and DevOps impact, rather than raw training speed. + +\subsection*{Track 2: Inference-Based DevOps Evaluation} +This track compares \textbf{production-style inference services} implemented in: +\begin{itemize} + \item Python-based ONNX inference, and + \item Rust-based ONNX inference. +\end{itemize} + +The focus is on deployment, security, containerization, CI/CD behavior, and runtime efficiency. + +Each track is designed to answer a distinct research question while remaining complementary. + +\section{Machine Learning Tasks Considered} + +To ensure coverage of diverse ML workloads, the following tasks are identified: + +\begin{itemize} + \item \textbf{Text Classification}: Dataset to be finalized. + \item \textbf{Image Classification}: MNIST dataset. + \item \textbf{Credit Score Assignment}: Supervised classification task. + \item \textbf{Multi-Objective Machine Learning}: Brain Tumor dataset with a MOML formulation. + \item \textbf{Fine-Tuning Task}: BERT-based classification (ANLP Assignment 1), with optional LoRA / QLoRA. + \item \textbf{Autoregressive Decoding}: Experiments using the Burn framework. +\end{itemize} + +At the current stage, the \textbf{MNIST image classification task has been fully implemented}. +The corresponding training code is available in the project GitHub repository. 
+ +\hrule + +\section{Related Work} + +The following research papers are being used to guide experimental design and evaluation: + +\begin{itemize} + \item \url{https://ieeexplore.ieee.org/document/11126113} + \item \url{https://ieeexplore.ieee.org/document/11261485} + \item \url{https://ieeexplore.ieee.org/document/11212348} + \item \url{https://www.ijsred.com/volume8/issue2/IJSRED-V8I2P143.pdf} +\end{itemize} + +\hrule + +\section{Code Repository and Current Status} + +Project repository: +\begin{center} +\url{https://github.com/Abhinav-Kumar012/Rust_Python_ML_PE.git} +\end{center} + +Current progress includes: +\begin{itemize} + \item MNIST training pipeline implemented + \item PyTorch baseline established + \item Initial Rust (Burn) training setup completed +\end{itemize} + +\hrule + +\section{Track 1: Training-Based Systems Evaluation} + +\subsection{Objective} + +The objective of this track is to answer the following research question: + +\begin{quote} +\textit{Can Rust realistically support end-to-end machine learning training pipelines, and what system-level trade-offs does this introduce compared to PyTorch?} +\end{quote} + +This track explicitly avoids speed-centric benchmarking and instead focuses on system behavior. 
+ +\hrule + +\subsection{Frameworks Compared} + +\subsubsection{PyTorch (Baseline)} +\begin{itemize} + \item Language: Python + \item Training maturity: Very high + \item Ecosystem: Extensive +\end{itemize} + +\subsubsection{Rust (Burn)} +\begin{itemize} + \item Language: Rust + \item Training maturity: Emerging + \item Design: Idiomatic Rust, native training support +\end{itemize} + +\hrule + +\subsection{Experimental Controls} + +\textbf{Fixed Across Both Implementations} +\begin{itemize} + \item Dataset splits + \item Number of epochs + \item Batch size + \item Optimizer type + \item Learning rate + \item Hardware +\end{itemize} + +\textbf{Allowed Differences} +\begin{itemize} + \item Internal kernel implementations + \item Graph execution model + \item Memory management +\end{itemize} + +\hrule + +\subsection{Metrics Collected} + +\begin{itemize} + \item Training time per epoch (reported cautiously) + \item Loss curves and convergence behavior + \item Runtime failures and numerical stability + \item Reproducibility across runs + \item Environment setup and build complexity + \item Dependency footprint and artifact size +\end{itemize} + +\hrule + +\section{Track 2: Inference-Based DevOps Evaluation} + +\subsection{Objective} + +The objective of this track is to compare \textbf{deployment, security, and operational characteristics} of Python-based and Rust-based ML inference services executing the same ONNX model. + +\hrule + +\subsection{Inference Services Compared} + +\textbf{Python Service} +\begin{itemize} + \item FastAPI + Uvicorn + \item ONNX Runtime (Python) +\end{itemize} + +\textbf{Rust Service} +\begin{itemize} + \item Axum / Actix + \item burn-rs +\end{itemize} + +Both services expose identical inference endpoints and return identical outputs. 
+ +\hrule + +\subsection{Evaluation Dimensions} + +\begin{itemize} + \item CI/CD build behavior + \item Container image size and layering + \item Cold-start latency + \item Inference latency and throughput + \item Resource utilization + \item Security and supply-chain surface +\end{itemize} + +\hrule + +\section{Upcoming Work} + +The following tasks are planned for the next phase of the project: + +\begin{itemize} + \item Develop production-style inference services for both Python and Rust. + \item Write Dockerfiles for Python and Rust inference services. + \item Set up Jenkins-based CI pipelines for inference, including build, test, containerization, and security scanning. +\end{itemize} + +\clearpage + +\section{Task: MNIST Image Classification} +\subsection{Architecture Details} + +\begin{table*}[t!] +\centering +\renewcommand{\arraystretch}{1.3} +\begin{tabular}{|c|l|l|l|l|} +\hline +\textbf{Step} & \textbf{Layer} & \textbf{Configuration} & \textbf{Input Shape} & \textbf{Output Shape} \\ +\hline + +1 & Input & Grayscale Images & +$[B, H, W]$ & +$[B, H, W]$ \\ + +\hline +2 & Reshape & Add channel dimension & +$[B, H, W]$ & +$[B, 1, H, W]$ \\ + +\hline +3 & Conv2D (conv1) & +$1 \rightarrow 8$, kernel $3 \times 3$ & +$[B, 1, H, W]$ & +$[B, 8, H-2, W-2]$ \\ + +\hline +4 & Dropout & +$p = 0.5$ & +$[B, 8, H-2, W-2]$ & +$[B, 8, H-2, W-2]$ \\ + +\hline +5 & Conv2D (conv2) & +$8 \rightarrow 16$, kernel $3 \times 3$ & +$[B, 8, H-2, W-2]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +6 & Dropout & +$p = 0.5$ & +$[B, 16, H-4, W-4]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +7 & ReLU & +Activation & +$[B, 16, H-4, W-4]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +8 & Adaptive Avg Pool & +Output size $8 \times 8$ & +$[B, 16, H-4, W-4]$ & +$[B, 16, 8, 8]$ \\ + +\hline +9 & Flatten & +$16 \times 8 \times 8$ & +$[B, 16, 8, 8]$ & +$[B, 1024]$ \\ + +\hline +10 & Linear (fc1) & +$1024 \rightarrow \texttt{hidden\_size}$ & +$[B, 1024]$ & +$[B, \texttt{hidden\_size}]$ \\ + +\hline +11 & Dropout & +$p = 
0.5$ & +$[B, \texttt{hidden\_size}]$ & +$[B, \texttt{hidden\_size}]$ \\ + +\hline +12 & ReLU & +Activation & +$[B, \texttt{hidden\_size}]$ & +$[B, \texttt{hidden\_size}]$ \\ + +\hline +13 & Linear (fc2) & +$\texttt{hidden\_size} \rightarrow \texttt{num\_classes}$ & +$[B, \texttt{hidden\_size}]$ & +$[B, \texttt{num\_classes}]$ \\ + +\hline +\end{tabular} +\caption{Detailed architecture of the convolutional neural network implemented in Burn. +$B$ denotes batch size, $H$ and $W$ denote input image height and width respectively.} +\label{tab:burn-cnn-architecture} +\end{table*} + +\noindent\textbf{Notes:} +\begin{itemize} + \item All convolution layers use default stride = 1 and no padding. + \item Dropout probability is configurable via \texttt{ModelConfig.dropout}. + \item Adaptive average pooling ensures a fixed spatial resolution regardless of input size. + \item The model is fully differentiable and backend-agnostic via Burn's \texttt{Backend} trait. +\end{itemize} + + +\subsection{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{RustLocust/MNIST.pdf} + + +\subsection{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. 
The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{PythonLocust/MNIST.pdf} + + +\subsection{Rust: Dockerfile Design and Containerization Strategy} + +The Dockerfile used for the Rust-based MNIST inference application follows a multi-stage build strategy. Multi-stage builds are commonly used to reduce the size of the final container image by separating the compilation environment from the runtime environment. + +\subsubsection{Overview of Multi-Stage Build} + +The Dockerfile is divided into two major stages: + +\begin{enumerate} + \item Builder Stage + \item Runtime Stage +\end{enumerate} + +The builder stage is responsible for compiling the Rust application, while the runtime stage contains only the compiled binary and the required runtime dependencies. + +\subsubsection{Builder Stage} + +The first stage begins with: + +\begin{lstlisting} +FROM ubuntu:16.04 AS builder +\end{lstlisting} + +This instruction uses Ubuntu 16.04 as the base image for building the Rust application. The alias \texttt{builder} is assigned to this stage so that its outputs can later be referenced in the runtime stage. + +\paragraph{Working Directory} + +\begin{lstlisting} +WORKDIR /app/rust_ml +\end{lstlisting} + +The \texttt{WORKDIR} instruction sets the default working directory inside the container to: + +\begin{lstlisting} +/app/rust_ml +\end{lstlisting} + +All subsequent commands in the builder stage are executed relative to this directory. 
+ +\paragraph{Installing Build Dependencies} + +The following command installs the required packages for compiling the Rust project: + +\begin{lstlisting} +RUN apt-get update && apt-get install -y \ + curl \ + build-essential \ + pkg-config \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* +\end{lstlisting} + +Each package serves a specific purpose: + +\begin{itemize} + \item \texttt{curl}: Used to download external files, including the Rust installation script. + \item \texttt{build-essential}: Provides common compilation tools such as \texttt{gcc}, \texttt{g++}, and \texttt{make}. + \item \texttt{pkg-config}: Helps discover system libraries during the build process. + \item \texttt{ca-certificates}: Ensures secure HTTPS communication when downloading dependencies. +\end{itemize} + +The final cleanup command: + +\begin{lstlisting} +rm -rf /var/lib/apt/lists/* +\end{lstlisting} + +removes cached package lists to reduce image size. + +\paragraph{Installing Rust} + +Rust is installed using the official Rust installer: + +\begin{lstlisting} +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +\end{lstlisting} + +This command downloads and executes the \texttt{rustup} installer. + +The flags used have the following meanings: + +\begin{itemize} + \item \texttt{--proto '=https'}: Restricts downloads to HTTPS only. + \item \texttt{--tlsv1.2}: Forces the use of TLS version 1.2 for secure transport. + \item \texttt{-sSf}: Makes \texttt{curl} silent while still showing errors if the download fails. + \item \texttt{-y}: Automatically accepts all installation prompts. +\end{itemize} + +After Rust is installed, the PATH environment variable is updated: + +\begin{lstlisting} +ENV PATH="/root/.cargo/bin:${PATH}" +\end{lstlisting} + +This ensures that Rust tools such as \texttt{cargo} and \texttt{rustc} are available in subsequent commands. + +\paragraph{Copying Source Code} + +\begin{lstlisting} +COPY . . 
+\end{lstlisting} + +This instruction copies the entire project directory from the host system into the current working directory inside the container. + +\paragraph{Building the Application} + +\begin{lstlisting} +RUN cargo build --release -p mnist_infer +\end{lstlisting} + +This command compiles the Rust project in release mode. + +The options used are: + +\begin{itemize} + \item \texttt{--release}: Builds the application with compiler optimizations enabled. + \item \texttt{-p mnist\_infer}: Specifies that only the \texttt{mnist\_infer} package should be compiled. +\end{itemize} + +The generated executable is stored in: + +\begin{lstlisting} +/app/rust_ml/target/release/mnist_infer +\end{lstlisting} + +\subsubsection{Runtime Stage} + +The second stage begins with: + +\begin{lstlisting} +FROM nvidia/vulkan:1.3-470 +\end{lstlisting} + +This stage uses an NVIDIA Vulkan runtime image as the base image. The purpose of using this image is to provide Vulkan-related runtime libraries and GPU compatibility for applications that may rely on Vulkan acceleration. + +Compared to the builder image, this runtime image is significantly smaller because it does not contain compilation tools, Rust compilers, or source code. + + +\paragraph{Runtime Working Directory} + +\begin{lstlisting} +WORKDIR /app +\end{lstlisting} + +This sets the runtime working directory to: + +\begin{lstlisting} +/app +\end{lstlisting} + +All runtime files are placed relative to this location. + +\paragraph{Copying the Compiled Binary} + +\begin{lstlisting} +COPY --from=builder /app/rust_ml/target/release/mnist_infer /app/binary +\end{lstlisting} + +This instruction copies the compiled executable from the builder stage into the runtime image. + +The \texttt{--from=builder} option tells Docker to retrieve the file from the stage named \texttt{builder}. 
+ +The binary is renamed from: + +\begin{lstlisting} +mnist_infer +\end{lstlisting} + +to: + +\begin{lstlisting} +/app/binary +\end{lstlisting} + +inside the runtime container. + +\paragraph{Copying the Model File} + +\begin{lstlisting} +COPY ./model/mnist_rust/model.mpk /app/model/mnist_rust/model.mpk +\end{lstlisting} + +This instruction copies the trained model file into the runtime container. + +The model file is stored at: + +\begin{lstlisting} +/app/model/mnist_rust/model.mpk +\end{lstlisting} + +The application can later load this file during inference. + +\paragraph{Environment Variables} + +Two environment variables are defined: + +\begin{lstlisting} +ENV RUST_LOG=info +ENV MODEL_PATH=/app/model/mnist_rust/model.mpk +\end{lstlisting} + +Their purposes are: + +\begin{itemize} + \item \texttt{RUST\_LOG=info}: Enables logging at the info level. + \item \texttt{MODEL\_PATH}: Stores the path to the trained model file. +\end{itemize} + +Using environment variables makes the application more flexible because configuration values can be changed without modifying the source code. + +\paragraph{Exposing the Application Port} + +\begin{lstlisting} +EXPOSE 9050 +\end{lstlisting} + +This instruction documents that the containerized application listens on port 9050. + +Although \texttt{EXPOSE} does not automatically publish the port to the host system, it informs users and orchestration tools such as Docker Compose or Kubernetes which port should be mapped. + +\paragraph{Container Startup Command} + +\begin{lstlisting} +CMD ["./binary"] +\end{lstlisting} + +This instruction defines the default command executed when the container starts. + +The compiled Rust binary is launched directly from the runtime working directory. + +\subsubsection{Advantages of the Dockerfile Design} + +This Dockerfile provides several important advantages: + +\begin{itemize} + \item Reduced final image size through multi-stage builds. 
+ \item Separation of build dependencies and runtime dependencies. + \item Improved security because the runtime image does not contain compilers or source code. + \item Faster deployment due to a lightweight runtime container. + \item Better portability because the same container can run consistently across different environments. + \item Easier maintenance through the use of environment variables and explicit working directories. +\end{itemize} + +Overall, this Dockerfile is designed to efficiently package the Rust-based MNIST inference application for deployment while minimizing runtime overhead and maintaining reproducibility. + +\subsection{Python (PyTorch) Dockerfile} + +This section details the image optimization strategy implemented for the MNIST inference container. The core approach minimizes the Docker image size by decoupling the heavy machine learning dependencies (PyTorch, etc.) from the application container. Instead of baking these libraries into the image, they are stored on an external volume (NFS share) and mounted at runtime. + +\subsubsection{Dockerfile Analysis} + +The \texttt{Dockerfile} is kept intentionally lightweight. By excluding large dependencies like \texttt{torch} from the \texttt{pip install} command, the image size remains very small (only containing the base Python runtime and lightweight web frameworks). 
+ +\vspace{0.5em} +\noindent\textbf{Listing 1: Optimized Inference Dockerfile} +\label{lst:dockerfile_inference} +\vspace{0.3em} + +\begin{lstlisting}[language=Dockerfile] +FROM python:3.12-slim + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV OMP_NUM_THREADS=1 +ENV MKL_NUM_THREADS=1 + +# Critical: Point Python to the external volume +ENV PYTHONPATH=/external-libs/ml_env/lib/python3.12/site-packages + +WORKDIR /app + +# Only install lightweight app dependencies +RUN pip install --no-cache-dir --upgrade pip && \ + pip install fastapi==0.110.0 uvicorn==0.29.0 python-multipart==0.0.9 + +COPY app.py model.py model.pt ./ + +EXPOSE 8000 + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] +\end{lstlisting} + +\begin{itemize} + \item \textbf{Base Image:} Uses \texttt{python:3.12-slim} to minimise the OS footprint. + \item \textbf{Environment Configuration:} + \begin{itemize} + \item \texttt{PYTHONDONTWRITEBYTECODE=1}: Prevents Python from writing \texttt{.pyc} files to disk. + + \item \textbf{\texttt{PYTHONPATH}}: Crucially set to + \path{/external-libs/ml_env/lib/python3.12/site-packages}. + This instructs the Python interpreter to look for libraries in the mounted volume directory, not just the default system paths. + \end{itemize} + \item \textbf{Minimal Dependencies:} The \texttt{pip install} command only installs \texttt{fastapi}, \texttt{uvicorn}, and \texttt{python-multipart}. Heavy ML libraries are assumed to be present in the mounted volume. +\end{itemize} + +\subsubsection{Volume Mounting Strategy} + +The strategy relies on two shell scripts to set up the environment on the host machine and run the container with the correct volume mappings. + +\paragraph{Library Setup (\texttt{mount\_libs.sh})} +This script runs on the host machine (or a VM node) to prepare the shared library volume. +\begin{enumerate} + \item \textbf{NFS Client Installation:} It installs \texttt{nfs-common} to enable Network File System capabilities. 
+ \item \textbf{Mounting:} It connects to a remote NFS server (\texttt{172.16.203.14}) where the pre-installed ML libraries reside. + \item \textbf{Local Path:} The remote libraries are mounted to \texttt{/mnt/ml-libs} on the host. This directory acts as the bridge between the NFS server and the Docker container. +\end{enumerate} + +\paragraph{Runtime Execution (\texttt{run\_container.sh})} +This script launches the Docker container with the necessary runtime configurations to access the external libraries. + +\vspace{0.5em} +\noindent\textbf{Listing 2: Container Execution Command} +\label{lst:docker_run} +\vspace{0.3em} + +\begin{lstlisting}[language=Bash] +docker run -d \ + -v /mnt/ml-libs:/external-libs \ + -e PYTHONPATH=/external-libs/ml_env/lib/python3.12/site-packages \ + -p 8000:8000 \ + fastapi-ml-app +\end{lstlisting} + +\begin{itemize} + \item \textbf{\texttt{-v /mnt/ml-libs:/external-libs}}: This bind mount maps the host's \texttt{/mnt/ml-libs} (which contains the NFS data) to \texttt{/external-libs} inside the container. + \item \textbf{\texttt{-e PYTHONPATH=...}}: An explicit environment-variable override ensures that the container's Python runtime finds the packages in \texttt{/external-libs}. +\end{itemize} + +\subsubsection{Benefits and Optimization} + +\begin{table*}[t!] +\centering +\caption{Optimization Benefits} +\label{tab:docker_optimization} +\begin{tabular}{|l|p{6cm}|p{6cm}|} +\hline +\textbf{Feature} & \textbf{Standard Approach} & \textbf{Volume Mount Approach} \\ \hline +\textbf{Image Size} & \textbf{Huge} ($>$2\,GB). Includes PyTorch, CUDA binaries, and all dependencies. & \textbf{Tiny} ($\sim$100\,MB). Only contains app code and minimal HTTP libs. \\ \hline +\textbf{Build Time} & \textbf{Slow}. Downloading and installing PyTorch takes minutes. & \textbf{Fast}. Setup installs only the lightweight web dependencies (\texttt{fastapi}, \texttt{uvicorn}, \texttt{python-multipart}). \\ \hline +\textbf{Updates} & Requires rebuilding and pushing large layers for every code change. & Code changes only require rebuilding the tiny app layer. 
Library updates are handled externally. \\ \hline +\end{tabular} +\end{table*} + +This architecture allows for rapid deployment and updating of the application logic without the overhead of moving gigabytes of container layers for unchanged machine learning dependencies. + +\subsection{Rust: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model used for the experiment consisted of two convolutional layers followed by adaptive average pooling, dropout, and two fully connected layers. The complete architecture is shown below: + +\begin{lstlisting} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 80} + conv2: Conv2d {ch_in: 8, ch_out: 16, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 1168} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, bias: true, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, bias: true, params: 5130} + activation: Relu + params: 531178 +} +\end{lstlisting} + +The model was trained for 10 epochs. Over the course of training, both the training and validation performance improved consistently. Training accuracy increased from 81.575\% in the first epoch to 97.300\% in the final epoch, while validation accuracy improved from 92.133\% to 98.517\%. + +Similarly, the training loss decreased significantly from 0.656 to 0.087, and the validation loss reduced from 0.258 to 0.054 by the end of training. The macro F1-score also improved substantially, reaching 96.974\% for training and 98.321\% for validation. + +\begin{table*}[t!] 
+\centering +\caption{Training and Validation Metrics Summary} +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Accuracy & 81.575 & 1 & 97.300 & 10 \\ +Train & Loss & 0.087 & 10 & 0.656 & 1 \\ +Train & Precision@Top1 [Macro] & 82.126 & 1 & 97.304 & 10 \\ +Train & Recall@Top1 [Macro] & 81.286 & 1 & 97.232 & 10 \\ +Train & F1-Score@Top1 [Macro] & 79.715 & 1 & 96.974 & 10 \\ +Train & Top-5 Accuracy & 97.696 & 1 & 99.969 & 10 \\ +Train & CPU Memory (GB) & 2.514 & 2 & 2.927 & 10 \\ +Train & CPU Usage (\%) & 20.753 & 5 & 30.394 & 10 \\ +\hline +Valid & Accuracy & 92.133 & 1 & 98.517 & 10 \\ +Valid & Loss & 0.054 & 10 & 0.258 & 1 \\ +Valid & Precision@Top1 [Macro] & 92.154 & 1 & 98.527 & 10 \\ +Valid & Recall@Top1 [Macro] & 91.978 & 1 & 98.425 & 10 \\ +Valid & F1-Score@Top1 [Macro] & 91.176 & 1 & 98.321 & 10 \\ +Valid & Top-5 Accuracy & 99.583 & 1 & 99.967 & 10 \\ +Valid & CPU Memory (GB) & 2.514 & 2 & 3.085 & 10 \\ +Valid & CPU Usage (\%) & 20.539 & 2 & 39.652 & 10 \\ +\hline +\end{tabular} +\label{tab:cnn_metrics_summary} +\end{table*} + +The results indicate that the model achieved strong generalization performance with minimal overfitting, as the validation accuracy remained slightly higher than the training accuracy throughout the experiment. The consistently high Top-5 accuracy values further demonstrate that the model almost always ranked the correct class among its top five predictions. + +It should also be noted that the execution terminated with a segmentation fault after training completion. However, since the fault occurred after all epochs had been completed and metrics had already been recorded, it did not affect the validity of the training results. + +The time taken to train the model was 229.916\,s (3\,min 49.916\,s). 
+ + + +\subsection{Python: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model implemented in Python mirrors the Rust architecture, consisting of two convolutional layers followed by adaptive average pooling, dropout regularization, and two fully connected layers. + +\textbf{Model Architecture:} + +\begin{lstlisting} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, kernel_size: [3, 3], stride: [1, 1]} + conv2: Conv2d {ch_in: 8, ch_out: 16, kernel_size: [3, 3], stride: [1, 1]} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, params: 5130} + activation: ReLU + total params: 531178 +} +\end{lstlisting} + +The model was trained for 10 epochs. Training and validation performance improved consistently over time. + +Training accuracy increased from 82.01\% to 97.29\%, while validation accuracy improved from 92.87\% to 98.20\%. +Training loss decreased significantly from 0.5947 to 0.0867, and validation loss reduced from 0.2475 to 0.0579. + +The macro F1-score reached 0.9814, demonstrating strong classification performance. Additionally, the Top-5 accuracy achieved 99.98\%, indicating highly reliable predictions. + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Accuracy & 82.01 & 1 & 97.29 & 10 \\ +Train & Loss & 0.0867 & 10 & 0.5947 & 1 \\ +Train & Precision@Top1 [Macro] & -- & -- & -- & -- \\ +Train & Recall@Top1 [Macro] & -- & -- & -- & -- \\ +Train & F1-Score@Top1 [Macro] & -- & -- & -- & -- \\ +Train & Top-5 Accuracy & -- & -- & -- & -- \\ +Train & CPU Memory (GB) & 0.84 & -- & 0.98 & -- \\ +Train & CPU Usage (\%) & 72.1 & -- & 92.0 & -- \\ +\hline + +Valid & Accuracy & 92.87 & 1 & 98.20 & 10 \\ +Valid & Loss & 0.0579 & 10 & 0.2475 & 1 \\ +Valid & Precision@Top1 [Macro] & 0.9816 & -- & 0.9816 & -- \\ +Valid & Recall@Top1 [Macro] & 0.9812 & -- & 0.9812 & -- \\ +Valid & F1-Score@Top1 [Macro] & 0.9814 & -- & 0.9814 & -- \\ +Valid & Top-5 Accuracy & 99.98 & -- & 99.98 & -- \\ +Valid & CPU Memory (GB) & 0.84 & -- & 0.98 & -- \\ +Valid & CPU Usage (\%) & 72.1 & -- & 92.0 & -- \\ +\hline + +\end{tabular} +\caption{Python Training and Validation Metrics Summary} +\end{table*} + +The results indicate strong generalization performance with no signs of overfitting. Validation accuracy remained consistently high and closely followed training accuracy. + +Training was stable with zero NaN events observed. The total training time was 182.23 seconds, with an average epoch time of 15.05 seconds and an average iteration speed of 50.58 iterations per second. + + + +\newpage + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 1.02GB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ +\clearpage + +\section{Task: Regression} + +\subsection{Introduction} +This document outlines the architectural, mathematical, and deployment specifics of implementing a Neural Network-based Regression model across two disparate machine learning environments: Rust (utilizing the Burn framework) and Python (utilizing PyTorch). It covers the distinct model architecture decisions, dataset handling strategies, and specialized pipeline deployment techniques leveraging Network File Systems (NFS) mapping via Docker bounds. + +\subsection{Model Architecture and Mathematical Formulation} + +\subsubsection{Mathematical Foundation} +The core mathematical foundation deployed across both frameworks is a classical Feed-Forward Neural Network consisting of a single hidden dimension mapping inputs directly onto a continuous single-variable regression output. + +For a given input feature vector $X \in \mathbb{R}^N$ (where $N$ dictates the feature size depending on the target dataset), the network's forward transformation can be represented sequentially as: +\begin{align} + Z_1 &= X \cdot W_1^T + b_1 \quad &\text{(Input Projection)} \\ + A_1 &= \max(0, Z_1) \quad &\text{(ReLU Activation)} \\ + \hat{Y} &= A_1 \cdot W_2^T + b_2 \quad &\text{(Output Projection)} +\end{align} + +Where: +\begin{itemize} + \item $W_1 \in \mathbb{R}^{H \times N}$ and $b_1 \in \mathbb{R}^H$ map the inputs onto the hidden vector space $H$. + \item $\max(0, \cdot)$ denotes the Non-Linear Rectified Linear Unit (ReLU) mapping algorithm. + \item $W_2 \in \mathbb{R}^{1 \times H}$ and $b_2 \in \mathbb{R}$ collapse the hidden abstraction onto the finalized regression scalar prediction $\hat{Y}$. +\end{itemize} + +\subsubsection{Architectural Configurations} +While the mathematical foundations are identical, implementations slightly differ based on dataset selections within the modules: +\begin{itemize} + \item \textbf{PyTorch Architecture:} Configures $N=13$ input features mapping to $H=64$ hidden parameters. 
+ \item \textbf{Rust (Burn) Architecture:} Configures $N=8$ input features concurrently mapping to $H=64$ hidden parameters. +\end{itemize} +In both configurations, standard parameter biases (\texttt{bias=True}) are included and automatically initialized. + +\subsection{Training Pipelines} + +Both codebases train the model iteratively tracking gradients via the Adam optimizer scaled against Mean Squared Error (MSE) loss logic: +\[ \text{MSE} = \frac{1}{B} \sum_{i=1}^{B} (Y_i - \hat{Y}_i)^2 \] + +\subsubsection{PyTorch Context} +\begin{itemize} + \item \textbf{Data Loading:} Automatically pulls the \textbf{Boston Housing} dataset array (.npz file) from an external Google API via \texttt{urllib} and manually partitions it down into an explicit $80/20$ split. + \item \textbf{Telemetry Metrics:} Generates explicit hardware tracking loops inside the main epoch runner. Uses the \texttt{psutil} library to compute and stream epoch \texttt{iteration\_speed}, raw RAM consumption, and \texttt{cpu\_temp} hardware sensors parallel to the loss parameters. +\end{itemize} + +\subsubsection{Rust (Burn) Context} +\begin{itemize} + \item \textbf{Data Loading:} Links into Hugging Face's dataset registry asynchronously targeting the \textbf{California Housing} SQLized splits mapping onto memory arrays via localized \texttt{HousingDistrictItem} structs. + \item \textbf{Normalization Mapping:} Computes spatial min-max normalizations programmatically over inputs during training: + \[ X_{norm} = \frac{X - \text{min}}{\text{max} - \text{min}} \] + This logic restricts features within standard boundaries precluding exploding gradient derivations. +\end{itemize} + +\subsection{Inference Pipeline and Docker NFS Integration} + +Deploying these isolated pipelines necessitates radically different execution strategies, highlighting Python's heavyweight runtime dependency bottlenecks versus Rust's compile-time optimizations. 
+ +\subsubsection{PyTorch Inference Architecture} +Standard PyTorch Docker environments routinely eclipse several gigabytes due to CUDA bindings and generic scientific computation loops. To circumvent this inside microservices, the PyTorch inference pipeline mandates a hybrid Network File System (NFS) mapping architecture: +\begin{enumerate} + \item \textbf{NFS Mounting (\texttt{mount\_libs.sh}):} Installs an external \texttt{nfs-common} client locally and binds the extensive Python library volume from an external dedicated storage server (\texttt{172.16.203.14}) into the host machine's \texttt{/mnt/LSTM-libs} map. + \item \textbf{Lightweight Container Image:} The backend \texttt{Dockerfile} avoids \texttt{pip install} commands completely, simply initializing a barebone \texttt{nvidia/cuda:12.1.1} image mapping Python $3.11$ system links. + \item \textbf{Volume Inject (\texttt{run\_container.sh}):} The script initializes the container enforcing \texttt{-v} flags that sync the NFS \texttt{/mnt/LSTM-libs} directory seamlessly onto the Docker's \texttt{/external-libs}. Crucially, it overrides the system \texttt{PYTHONPATH} to target those external \texttt{site-packages} at runtime. + \item \textbf{Execution:} The \texttt{FastAPI} instance loads, bypasses massive disk pulls, links the models iteratively, and fields inbound \texttt{HousingFeatures} lists continuously. +\end{enumerate} + +\subsubsection{Rust (Burn) Inference Architecture} +Rust handles Docker microservices inherently via statically linked deployments: +\begin{itemize} + \item \textbf{Multi-Stage Compiling:} Executes a build phase operating within an oversized \texttt{rust:1.92-alpine} chain, ejecting the resulting binary onto an isolated stripped \texttt{alpine:3.23} environment structure. + \item \textbf{Native Routing:} Utilizes \texttt{Axum} servers to establish the HTTP logic endpoints securely routing JSON payloads mapping to specific feature names (e.g. \texttt{median\_income}, \texttt{house\_age}). 
+\end{itemize} + +\subsection{Rust: Regression Model Performance Analysis} + +The regression model used in this experiment was a simple feed-forward neural network consisting of one hidden layer followed by an output layer. The model was designed to predict the median house value based on eight input features. + +\subsubsection{Model Architecture} + +The architecture of the regression model is shown below: + +\begin{lstlisting} +RegressionModel { + input_layer: Linear {d_input: 8, d_output: 64, bias: true, params: 576} + output_layer: Linear {d_input: 64, d_output: 1, bias: true, params: 65} + activation: Relu + params: 641 +} +\end{lstlisting} + +The model contains: + +\begin{itemize} + \item An input layer that maps 8 input features to 64 hidden units + \item A ReLU activation function applied after the hidden layer + \item An output layer that maps the 64 hidden units to a single scalar value +\end{itemize} + +The total number of trainable parameters in the model was only 641, making it a lightweight model suitable for fast training and inference. + +\subsubsection{Training Configuration} + +The model was trained for 100 epochs. A constant learning rate of: + +\[ +1.0 \times 10^{-3} +\] + +was used throughout the entire training process. + +\subsubsection{Training Performance} + +The training loss decreased substantially over the 100 epochs. Initially, the model started with a training loss of 3.086 during the first epoch. By the final epoch, the loss had reduced to 0.414. + +This significant reduction in loss indicates that the model successfully learned the underlying relationship between the input features and the target variable. + +\subsubsection{Validation Performance} + +Validation loss also showed a considerable improvement during training. The validation loss decreased from 4.132 in the first epoch to a minimum of 0.635 at epoch 51. 
+ +The difference between the final training loss and the minimum validation loss suggests that the model achieved good generalization performance without severe overfitting. + +\begin{table*}[t!] +\centering +\caption{Training and Validation Metrics Summary for the Regression Model} +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Loss & 0.414 & 100 & 3.086 & 1 \\ +Train & Learning Rate & $1.0 \times 10^{-3}$ & 1 & $1.0 \times 10^{-3}$ & 100 \\ +Train & CPU Memory (GB) & 2.125 & 4 & 2.325 & 56 \\ +Train & CPU Usage (\%) & 19.539 & 54 & 37.989 & 11 \\ +\hline +Valid & Loss & 0.635 & 51 & 4.132 & 1 \\ +Valid & CPU Memory (GB) & 2.124 & 3 & 2.325 & 55 \\ +Valid & CPU Usage (\%) & 19.550 & 54 & 37.960 & 11 \\ +\hline +\end{tabular} +\label{tab:regression_metrics_summary} +\end{table*} + +\subsubsection{Prediction Example} + +A sample prediction generated by the model is shown below: + +\begin{lstlisting} +Predicted 2.021734 Expected 2.158 +\end{lstlisting} + +The predicted value is reasonably close to the expected value, indicating that the model was able to approximate the target variable with acceptable accuracy. + +Since the median house value was measured in units of 100,000 dollars, the prediction corresponds to: + +\begin{itemize} + \item Predicted value: approximately 202,173 dollars + \item Expected value: approximately 215,800 dollars +\end{itemize} + +\subsubsection{Predicted vs. Expected Distribution} + +The predicted-versus-expected plot suggests that the model captures the general trend in the target values, although some prediction errors remain for certain samples. + +Most of the predicted values appear concentrated around the central region of the distribution, indicating that the model performs better on common house value ranges than on extreme values. 
+ +\subsubsection{Resource Utilization} + +The model required relatively little memory during execution. Training memory usage ranged from 2.125 GB to 2.325 GB, while validation memory usage ranged from 2.124 GB to 2.325 GB. + +CPU utilization remained moderate throughout training. Training CPU usage ranged from 19.539\% to 37.989\%, while validation CPU usage ranged from 19.550\% to 37.960\%. + +CPU temperature values were unavailable and therefore recorded as NaN. + +\subsubsection{Execution Time and Failure} + +The complete training and evaluation process required: + +\begin{itemize} + \item Real time: 3 minutes and 18.257 seconds + \item User CPU time: 4 minutes and 13.554 seconds + \item System CPU time: 50.340 seconds +\end{itemize} + +\subsection{Language Specific Implementation Details} + +\subsubsection{PyTorch-Specific Paradigms} +\begin{itemize} + \item \textbf{Thread Clamping:} Due to inference optimization restrictions (especially running CPU variations alongside container structures), the \texttt{app.py} enforces explicit core binding calls via \texttt{torch.set\_num\_threads(1)} and \texttt{torch.set\_num\_interop\_threads(1)} securing computational resources and restricting OS context-switching overheads. + \item \textbf{Matrix Array Verifications:} Manually inspects raw matrix vector mappings validating dimensions dynamically against numeric constraints: \texttt{len(x) != NUM\_FEATURES} triggering runtime panics before pipeline evaluations fail. + \item \textbf{Manual Hardware Moving:} The framework is heavily littered with required \texttt{.to(device)} mapping configurations switching inputs, datasets, targets, and models manually between the host and external components. 
+\end{itemize} + +\subsubsection{Rust (Burn)-Specific Paradigms} +\begin{itemize} + \item \textbf{Generic Compile-Time Shapes:} Dimension mappings and tensor validations are fundamentally enforced inside the Rust compiler boundaries via \texttt{inputs: Tensor<B, 2>} arrays indicating batches of distinct input structures mapping to \texttt{targets: Tensor<B, 1>}. Invalid sizes fail compilation, voiding the requirement for manual PyTorch matrix validations. + \item \textbf{Struct Batching Protocols:} Inference doesn't evaluate primitive float arrays. Instead, the API relies on executing an overarching \texttt{HousingBatcher} which transforms specific struct domains (\texttt{HousingDistrictItem}) safely into tensor primitives while executing implicit \texttt{self.normalizer.to\_device(device)} logic silently against constants behind boundaries. + \item \textbf{Record Deserialization:} States are strictly detached from models via standard \texttt{.mpk} maps. They invoke explicit \texttt{NoStdTrainingRecorder::new().load()} tracking traits unbinding memory limits inherent to standard dict serialization configurations natively loaded via \texttt{RegressionModelConfig}. +\end{itemize} + +\newpage +\subsection{Python: Regression Model Architecture and Training Performance} + +The regression model used in this experiment is a lightweight fully connected neural network with a small number of parameters (961 total). The model is optimized using mean squared error loss. + + + +The model was trained for 100 epochs. Training loss decreased significantly from 8265.55 to 69.86 (99.15\% reduction), while validation loss decreased from 9045.34 to 55.70. + +Despite strong loss reduction, the model struggled to achieve good generalization. The validation $R^2$ score remained negative (-1.07), indicating that the model performs worse than a simple baseline predictor. + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Loss & 68.50 & 100 & 8265.55 & 1 \\ +Train & RMSE & 8.39 & 100 & 91.53 & 1 \\ +Train & MAE & 6.29 & 100 & 90.33 & 1 \\ +Train & R$^2$ & -96.93 & 1 & 0.1767 & 100 \\ +Train & Grad Norm (Total) & 118.80 & -- & 112876.03 & 1 \\ +Train & Iteration Speed (it/s) & 0.69 & 1 & 14.39 & 54 \\ +Train & CPU Memory (GB) & 0.87 & -- & 1.02 & -- \\ +Train & CPU Usage (\%) & 45.7 & -- & 97.6 & -- \\ +\hline + +Valid & Loss & 50.93 & 65 & 9045.34 & 1 \\ +Valid & RMSE & 7.14 & 65 & 95.11 & 1 \\ +Valid & MAE & 6.00 & 65 & 93.52 & 1 \\ +Valid & R$^2$ & -335.40 & 1 & -0.8940 & 65 \\ +Valid & Iteration Speed (it/s) & 0.69 & 1 & 14.39 & 54 \\ +Valid & CPU Memory (GB) & 0.87 & -- & 1.02 & -- \\ +Valid & CPU Usage (\%) & 45.7 & -- & 97.6 & -- \\ +\hline + +\end{tabular} +\caption{Regression Model Training and Validation Metrics Summary} +\end{table*} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 47.39 seconds + \item Average Epoch Time: 0.225 seconds + \item Iteration Speed (Mean): 10.36 it/s + \item Gradient Norm (Mean): 7981.31 + \item NaN Events: 0 + \item Convergence: Non-monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate that while optimization was successful in reducing loss, the model lacks sufficient capacity or feature representation to generalize well. The persistently negative validation $R^2$ suggests underfitting or a mismatch between model complexity and data characteristics. + + + + +\newpage + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 973MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ +\clearpage + +\section{Task: Text Classification (AG News)} +\subsection{Model Architecture and Training Strategy} + +The text classification system is built using the \texttt{Burn} framework in Rust, leveraging a Transformer-based architecture for feature extraction and a linear classification head. This section details the mathematical formulation of the model and the strategy employed for training. + +\subsubsection{Model Architecture} +The core of the model is a Transformer Encoder, which processes a sequence of token embeddings to capture contextual relationships. The architecture consists of three primary stages: embedding, encoding, and classification. + +\paragraph{Embedding Layer} +Input text is tokenized and converted into a sequence of indices $X \in \mathbb{N}^{B \times L}$, where $B$ is the batch size and $L$ is the sequence length. The model utilizes two parallel embedding layers: +\begin{enumerate} + \item \textbf{Token Embedding ($E_{tok}$)}: Maps token indices to dense vectors of dimension $d_{model}$. + \item \textbf{Positional Embedding ($E_{pos}$)}: Maps position indices $[0, \dots, L-1]$ to dense vectors of dimension $d_{model}$ to inject sequence order information. +\end{enumerate} + +The final embedding representation $E$ is obtained by averaging the token and positional embeddings: +\begin{equation} + E = \frac{E_{tok}(X) + E_{pos}(\text{positions})}{2} +\end{equation} + +\paragraph{Transformer Encoder} +The embedding tensor $E$ is passed through a multi-layer Transformer Encoder. Each layer consists of a Multi-Head Self-Attention (MHSA) mechanism followed by a Position-wise Feed-Forward Network (FFN), with residual connections and layer normalization. 
+ +The configuration used in this implementation is as follows: +\begin{itemize} + \item \textbf{Model Dimension ($d_{model}$)}: 256 + \item \textbf{Feed-Forward Dimension ($d_{ff}$)}: 1024 + \item \textbf{Number of Heads ($N_{heads}$)}: 8 + \item \textbf{Number of Layers ($N_{layers}$)}: 4 + \item \textbf{Normalization}: Layer norm applied before sub-layers (Pre-Norm). +\end{itemize} + +Let $H = \text{TransformerEncoder}(E)$, where $H \in \mathbb{R}^{B \times L \times d_{model}}$ represents the contextualized representations of the input sequence. + +\paragraph{Classification Head} +For classification, the model utilizes the representation of the first token (typically acting as the [CLS] token) from the encoded sequence. This vector is passed through a linear layer to project it into the class space: +\begin{equation} + Y = \text{Linear}(H_{[:, 0, :]}) +\end{equation} +where $Y \in \mathbb{R}^{B \times N_{classes}}$ represents the logits. For inference, a Softmax function is applied to obtain probabilities: +\begin{equation} + \hat{P} = \text{Softmax}(Y) +\end{equation} + +\begin{table*}[t!] 
+\centering +\caption{Model Architecture Summary} +\label{tab:model_arch} +\begin{tabular}{|l|l|c|c|} +\hline +\textbf{Component} & \textbf{Configuration / Details} & \textbf{Input Shape} & \textbf{Output Shape} \\ \hline +Token Embedding & $V \to d_{model}$ ($V$: Vocab Size) & $(B, L)$ & $(B, L, 256)$ \\ \hline +Pos Embedding & $L_{max} \to d_{model}$ & $(B, L)$ & $(B, L, 256)$ \\ \hline +Embedding Merge & Average ($E_{tok} + E_{pos}$) & - & $(B, L, 256)$ \\ \hline +Transformer Block & 4 Layers, 8 Heads, $d_{ff}=1024$ & $(B, L, 256)$ & $(B, L, 256)$ \\ \hline +Feature Extract & Slice First Token (Index 0) & $(B, L, 256)$ & $(B, 256)$ \\ \hline +Classifier Head & Linear ($256 \to N_{classes}$) & $(B, 256)$ & $(B, N_{classes})$ \\ \hline +\end{tabular} +\end{table*} + + + +\subsection{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{RustLocust/text.pdf} + + +\subsection{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. 
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{PythonLocust/text.pdf} + + +\subsubsection{Training Strategy} +The model is trained using a supervised learning approach with the following configuration: + +\paragraph{Loss Function} +The training objective is to minimize the Cross-Entropy Loss between the predicted logits $Y$ and the ground truth class labels $C$: +\begin{equation} + \mathcal{L} = \text{CrossEntropy}(Y, C) = -\sum_{c=1}^{N_{classes}} \mathbb{1}_{c=C} \log\left(\frac{e^{Y_c}}{\sum_{j} e^{Y_j}}\right) +\end{equation} + +\paragraph{Optimization} +We employ the \textbf{Adam} optimizer with the following parameters: +\begin{itemize} + \item \textbf{Weight Decay}: $5 \times 10^{-5}$ + \item \textbf{Beta Coefficients}: Standard defaults (typically $\beta_1=0.9, \beta_2=0.999$) +\end{itemize} + +\paragraph{Learning Rate Scheduling} +A \textbf{Noam Learning Rate Scheduler} is used to stabilize training. The learning rate increases linearly during a warmup phase and then decays proportionally to the inverse square root of the step number. +\begin{equation} +\begin{aligned} +LR &= d_{model}^{-0.5} \cdot \min( \\ + &\quad step\_num^{-0.5}, \\ + &\quad step\_num \cdot warmup\_steps^{-1.5} +) +\end{aligned} +\end{equation} +\begin{itemize} + \item \textbf{Warmup Steps}: 1000 + \item \textbf{Base Learning Rate}: 0.01 +\end{itemize} + +\paragraph{Metrics} +During training and validation, the following metrics are tracked to monitor performance: +\begin{itemize} + \item \textbf{Loss}: Cross-Entropy Loss. + \item \textbf{Accuracy}: Percentage of correct predictions. 
+ \item \textbf{F1-Score, Precision, Recall}: Macro-averaged metrics to account for class imbalance. +\end{itemize} + +\subsection{Burn Code Specifications} + +This section outlines the significant implementation details of the text classification system, focusing on the architectural choices in \texttt{model.rs} and the robust training pipeline defined in \texttt{training.rs}. +\subsubsection{Model Implementation (\texttt{model.rs})} +The \texttt{TextClassificationModel} leverages the \textbf{Burn} framework's modular design to implement a Transformer-based classifier. Key features of this implementation include: +\begin{itemize} + \item \textbf{Dual Embedding Strategy:} The model employs two distinct embedding layers: \texttt{embedding\_token} for semantic content and \texttt{embedding\_pos} for positional information. A unique characteristic of this implementation is the fusion strategy, where these embeddings are combined via averaging: + \[ + E_{final} = \frac{E_{pos} + E_{token}}{2} + \] + This differs from the standard summation approach often found in BERT implementations, potentially stabilizing the initial magnitude of the embedding vectors. + + \item \textbf{Configurable Architecture:} The system uses a \texttt{TextClassificationModelConfig} struct derived with the \texttt{Config} macro. This allows for type-safe and serializable hyperparameter management, ensuring the model architecture (hidden size, vocabulary size, sequence length) can be easily saved, loaded, and reproducible. + + \item \textbf{Masked Attention:} The forward pass actively utilizes padding masks (\texttt{mask\_pad}). These masks are passed into the \texttt{TransformerEncoderInput}, ensuring that the self-attention mechanism strictly ignores padding tokens, which is critical for handling variable-length text sequences correctly. + + \item \textbf{Separation of Train and Inference Logic:} The model explicitly implements the \texttt{TrainStep} and \texttt{InferenceStep} traits. 
+ \begin{itemize} + \item \textbf{Training:} Returns a \texttt{ClassificationOutput} struct containing the calculated Cross-Entropy loss for backpropagation. + \item \textbf{Inference:} Returns raw probabilities by applying a softmax activation on the output logits, facilitating direct class prediction. + \end{itemize} +\end{itemize} +\subsubsection{Training Pipeline (\texttt{training.rs})} +The training module is designed for reliability and comprehensive observability. It integrates advanced optimization techniques and hardware-aware monitoring. +\begin{itemize} + \item \textbf{Noam Scheduler:} Transformer models are notoriously sensitive to learning rates. The code implements the \textbf{Noam Learning Rate Scheduler} (popularized by ``Attention Is All You Need''), which features a linear warmup phase (1000 steps) followed by an inverse square root decay based on the model dimension ($d_{model}$). This prevents gradient explosions during early training stages. + + \item \textbf{Distributed Training Support:} The implementation explicitly handles distributed computing scenarios. It utilizes Rust's feature flags (\texttt{\#[cfg(feature = "ddp")]}) to switch between single-device training and \textbf{Distributed Data Parallel (DDP)} strategies. When enabled, it employs a tree-based \texttt{AllReduceStrategy} for synchronizing gradients across multiple GPUs or nodes. + + \item \textbf{Comprehensive Telemetry:} The training loop is instrumented with an extensive suite of metrics beyond simple accuracy. It tracks: + \begin{itemize} + \item \textbf{Classification Metrics:} Macro-averaged F1-Score, Precision, and Recall, providing a holistic view of model performance on imbalanced datasets. + \item \textbf{Hardware Diagnostics:} CPU temperature, memory usage, and utilization are logged alongside training progress, aiding in the detection of thermal throttling or memory leaks during long training runs. 
+ \end{itemize} + + \item \textbf{Efficient Data Sampling:} To manage large datasets efficiently, the loader utilizes a \texttt{SamplerDataset}. This limits the effective epoch size to 50,000 training samples and 5,000 validation samples, allowing for rapid iteration and feedback loops without needing to process the entire corpus in every epoch. +\end{itemize} + +\subsubsection{Conditional Compilation} +I think we should document a bit about this. + +\subsection{Rust Docker image} + +\subsection{Rust Inference Code} + +\subsection{Rust: Transformer-Based Text Classification Model Performance} + +The text classification model used in this experiment was based on a Transformer encoder architecture. The model consisted of token embeddings, positional embeddings, a multi-layer Transformer encoder, and a final linear classification layer. + +\subsubsection{Model Architecture} + +The architecture of the model is shown below: + +\begin{lstlisting} +TextClassificationModel { + transformer: TransformerEncoder { + d_model: 256, + d_ff: 1024, + n_heads: 8, + n_layers: 4, + dropout: 0.1, + norm_first: true, + quiet_softmax: true, + params: 3159040 + } + embedding_token: Embedding { + n_embedding: 28996, + d_model: 256, + params: 7422976 + } + embedding_pos: Embedding { + n_embedding: 256, + d_model: 256, + params: 65536 + } + output: Linear { + d_input: 256, + d_output: 4, + bias: true, + params: 1028 + } + n_classes: 4 + params: 10648580 +} +\end{lstlisting} + +The Transformer encoder used four encoder layers with eight attention heads per layer. Each layer had a model dimension of 256 and a feed-forward dimension of 1024. A dropout rate of 0.1 was used to reduce overfitting. + +The token embedding layer mapped a vocabulary of 28,996 tokens into 256-dimensional vectors. Positional embeddings of length 256 were also used so that the Transformer could capture token order information. + +The final output layer mapped the Transformer representation into four output classes. 
+ +The total number of trainable parameters in the model was 10,648,580. + +\subsubsection{Training Configuration} + +The model was trained for a total of 5 epochs. During training, the learning rate decayed from $1.107 \times 10^{-5}$ in the first epoch to $3.733 \times 10^{-6}$ in the final epoch. + +\subsubsection{Training Performance} + +Training accuracy improved steadily from 57.968\% in the first epoch to 81.474\% in the fifth epoch. Similarly, the training loss decreased from 0.981 to 0.507. + +The macro precision, recall, and F1-score also improved significantly during training: + +\begin{itemize} + \item Precision increased from 58.893\% to 81.606\% + \item Recall increased from 57.741\% to 81.334\% + \item F1-score increased from 50.201\% to 76.001\% +\end{itemize} + +These results indicate that the model learned meaningful semantic patterns in the text data over successive epochs. + +\subsubsection{Validation Performance} + +Validation performance also improved consistently. Validation accuracy increased from 72.280\% in the first epoch to a maximum of 81.640\% in the fourth epoch. + +The validation loss decreased from 0.731 to 0.507, showing that the model generalized reasonably well to unseen data. + +The validation precision, recall, and F1-score also showed strong improvement: + +\begin{itemize} + \item Precision increased from 72.230\% to 81.739\% + \item Recall increased from 72.258\% to 82.039\% + \item F1-score increased from 65.796\% to 76.509\% +\end{itemize} + +The relatively close values between training and validation accuracy suggest that the model did not suffer from severe overfitting. + +\begin{table*}[t!] +\centering +\caption{Training and Validation Metrics Summary for the Transformer-Based Text Classification Model} +% @kumar please layout sahi kar sakta hai? 
love you +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Accuracy & 57.968 & 1 & 81.474 & 5 \\ +Train & Loss & 0.507 & 5 & 0.981 & 1 \\ +Train & Precision@Top1 [Macro] & 58.893 & 1 & 81.606 & 5 \\ +Train & Recall@Top1 [Macro] & 57.741 & 1 & 81.334 & 5 \\ +Train & F1-Score@Top1 [Macro] & 50.201 & 1 & 76.001 & 5 \\ +Train & Learning Rate & $3.733 \times 10^{-6}$ & 5 & $1.107 \times 10^{-5}$ & 1 \\ +Train & CPU Memory (GB) & 2.401 & 2 & 2.736 & 5 \\ +Train & CPU Usage (\%) & 16.529 & 1 & 17.160 & 5 \\ +\hline +Valid & Accuracy & 72.280 & 1 & 81.640 & 4 \\ +Valid & Loss & 0.507 & 5 & 0.731 & 1 \\ +Valid & Precision@Top1 [Macro] & 72.230 & 1 & 81.739 & 5 \\ +Valid & Recall@Top1 [Macro] & 72.258 & 1 & 82.039 & 5 \\ +Valid & F1-Score@Top1 [Macro] & 65.796 & 1 & 76.509 & 5 \\ +Valid & CPU Memory (GB) & 2.263 & 1 & 2.747 & 5 \\ +Valid & CPU Usage (\%) & 20.331 & 1 & 22.190 & 3 \\ +\hline +\end{tabular} +\label{tab:transformer_text_classification_metrics} +\end{table*} + +\subsubsection{Resource Utilization} + +The CPU memory usage remained relatively stable throughout training. Training memory usage ranged from 2.401 GB to 2.736 GB, while validation memory usage ranged from 2.263 GB to 2.747 GB. + +CPU utilization also remained moderate, with training CPU usage ranging from 16.529\% to 17.160\% and validation CPU usage ranging from 20.331\% to 22.190\%. + +CPU temperature values were unavailable during the experiment and therefore recorded as NaN. + +\subsubsection{Execution Time and Failure} + +The complete training run required: + +\begin{itemize} + \item Real time: 32 minutes and 37.872 seconds + \item User CPU time: 36 minutes and 52.313 seconds + \item System CPU time: 1 minute and 41.258 seconds +\end{itemize} + +\subsection{PyTorch Training Pipeline} +This section details the Python implementation of the text classification training pipeline. 
The code mimics the architecture and logic of the Rust version to ensure comparable performance and behavior. +\subsubsection{Code Highlights} +\begin{itemize} + \item \textbf{Custom Transformer Model:} + The \texttt{TextClassificationModel} is a custom \texttt{nn.Module} containing: + \begin{itemize} + \item Dual embedding layers (\texttt{embedding\_token} and \texttt{embedding\_pos}). + \item A unique fusion strategy averaging the two embeddings: $E = (E_{pos} + E_{token}) / 2$. + \item A standard \texttt{TransformerEncoder} stack. + \item A classification head that projects the encoded features to the 4 output classes of the AG News dataset. + \end{itemize} + \item \textbf{Noam Learning Rate Scheduler:} + A custom \texttt{NoamLR} scheduler is implemented to replicate the specific warmup and decay behavior used in the Rust implementation (and the original ``Attention Is All You Need'' paper). + \[ + lr = \text{factor} \cdot d_{model}^{-0.5} \cdot \min(\text{step}^{-0.5},\ \text{step} \cdot \text{warmup}^{-1.5}) + \] + This ensures stable training dynamics for the Transformer architecture. + \item \textbf{Dataset Handling:} + The code utilizes the Hugging Face \texttt{datasets} library to load the \texttt{ag\_news} dataset. It explicitly shuffles and subsets the data (50,000 train, 5,000 test) to match the constraints applied in the Rust implementation, ensuring a fair apples-to-apples comparison between the two languages. + \item \textbf{Collate Function with Padding Masks:} + A custom \texttt{collate\_fn} handles dynamic batching. It tokenizes text using the \texttt{bert-base-cased} tokenizer and generates a boolean padding mask. 
Note the inversion logic: PyTorch's \texttt{TransformerEncoder} expects \texttt{True} for padded positions (unlike some other implementations where 1 implies validity), requiring careful mask generation: + \begin{lstlisting} + mask_pad = (encoding['attention_mask'] == 0) + \end{lstlisting} + \item \textbf{Training Loop:} + The training loop is a standard PyTorch implementation using \texttt{tqdm} for progress tracking. It uses \texttt{CrossEntropyLoss} as the criterion and the \texttt{Adam} optimizer. Crucially, the scheduler step is called after every batch (not every epoch), consistent with the Noam schedule requirements. +\end{itemize} + + +\subsection{Python: Text Classification (News) Transformer Model} + +The model is a Transformer-based architecture designed for multi-class news classification. It consists of a multi-layer encoder with multi-head self-attention and feedforward networks. + +\textbf{Model Architecture:} + +\begin{lstlisting} +Model { + transformer_encoder: { + d_model: 256, + nhead: 8, + num_layers: 4, + dim_feedforward: 1024 + } + max_seq_len: 256 + num_classes: 4 + total params: 10649092 +} +\end{lstlisting} + +The model was trained for 5 epochs and showed steady convergence across all evaluation metrics. + +Training accuracy improved from 56.19\% to 79.70\%, while validation accuracy increased from 68.14\% to 79.00\%. +Training loss decreased from 1.0145 to 0.5483, and validation loss reduced from 0.8137 to 0.5628. + +The model achieved a macro F1-score of 0.7903 on the validation/test set, indicating reasonably strong classification performance for a Transformer trained over a small number of epochs. + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Accuracy & 56.19 & 1 & 79.70 & 5 \\ +Train & Loss & 0.5483 & 5 & 1.0145 & 1 \\ +Train & Grad Norm (Total) & 10.39 & 4 & 22.95 & 3 \\ +Train & Iteration Speed (it/s) & 44.33 & 2 & 45.05 & 1 \\ +Train & CPU Memory (GB) & 1.12 & -- & 1.12 & -- \\ +Train & CPU Usage (\%) & 22.2 & -- & 23.5 & -- \\ +\hline + +Valid & Accuracy & 68.14 & 1 & 79.00 & 5 \\ +Valid & Loss & 0.5628 & 5 & 0.8137 & 1 \\ +Valid & Precision@Top1 [Macro] & 0.7187 & 1 & 0.7942 & 5 \\ +Valid & Recall@Top1 [Macro] & 0.6815 & 1 & 0.7899 & 5 \\ +Valid & F1-Score@Top1 [Macro] & 0.6773 & 1 & 0.7903 & 5 \\ +Valid & Iteration Speed (it/s) & 44.33 & 2 & 45.05 & 1 \\ +Valid & CPU Memory (GB) & 1.12 & -- & 1.12 & -- \\ +Valid & CPU Usage (\%) & 22.2 & -- & 23.5 & -- \\ +\hline + +\end{tabular} +\caption{Transformer Text Classification Training and Validation Metrics} +\end{table*} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 706.99 seconds + \item Average Epoch Time: 139.83 seconds + \item Iteration Speed (Mean): 44.70 it/s + \item Gradient Norm (Mean): 15.75 + \item GPU Memory Usage: 178.79 MB + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate stable training and consistent improvement across epochs. While performance is lower than simpler CNN-based tasks, this is expected due to the increased complexity of natural language understanding tasks. + +\subsection{PyTorch Inference Pipeline Docker Image: Hybrid NFS and Docker Inference Architecture} + +This section details the hybrid deployment strategy designed to optimize Docker image size and leverage a centralized machine learning environment. 
The architecture splits the responsibilities between a \textbf{Library VM} (storage-heavy) and a \textbf{Docker VM} (compute-centric). + +\subsubsection{Architecture Overview} + +The system comprises two primary components: +\begin{enumerate} + \item \textbf{Library VM (NFS Server)}: Hosts the heavy Python environment, including PyTorch, Transformers, and CUDA libraries. This environment is exported via NFS. + \item \textbf{Docker VM (Inference Client)}: Runs a lightweight Docker container that mounts the external libraries at runtime. +\end{enumerate} + +\subsubsection{Implementation Details} + +\paragraph{1. Library Sharing via NFS} +The Library VM exports the directory containing the Python site-packages. On the Docker VM, this directory is mounted using the \texttt{mount\_libs.sh} script. + +\vspace{0.5em} +\noindent\textbf{Listing 3: Mounting the NFS Library Volume} +\label{lst:nfs_mount} +\vspace{0.3em} + +\begin{lstlisting}[language=bash] +# Configuration from mount_libs.sh +NFS_SERVER_IP="172.16.203.14" +NFS_EXPORT_PATH="/home/iiitb/Documents/textClassificationVolume" +LOCAL_MOUNT_POINT="/mnt/text-libs" + +# Mounting the remote volume +sudo mount -t nfs "$NFS_SERVER_IP:$NFS_EXPORT_PATH" "$LOCAL_MOUNT_POINT" +\end{lstlisting} + +\paragraph{2. Lightweight Docker Image} +The Docker image is built using \texttt{Dockerfile.cpu} and excludes heavy ML libraries. It only contains the application code, the model weights, and minimal system dependencies. + +\vspace{0.5em} +\noindent\textbf{Listing 4: Dockerfile.cpu Configuration} +\label{lst:dockerfile_cpu} +\vspace{0.3em} + +\begin{lstlisting}[language=Dockerfile] +FROM python:3.12-slim + +# Point Python to the external NFS mount +ENV PYTHONPATH=/external-libs/text_env/lib/python3.12/site-packages + +# Copy only the app and model +COPY app.py ./ +COPY model_pytorch_text_classification/ag_news_model.pth ./model/ + +# No 'pip install torch' is performed here! +\end{lstlisting} + +\paragraph{3. 
Runtime Execution} +The container is launched via \texttt{run\_inference.sh}, which mounts the NFS volume into the container at \texttt{/external-libs}. + +\vspace{0.5em} +\noindent\textbf{Listing 5: GPU-Based Container Execution Command} +\label{lst:docker_gpu_run} +\vspace{0.3em} + +\begin{lstlisting}[language=bash] +docker run --gpus all \ + -v /mnt/text-libs:/external-libs \ + -v text_model_vol:/models \ + -e PYTHONPATH=/external-libs/text_env/lib/python3.12/site-packages \ + -p 8000:8000 \ + text_classification_image +\end{lstlisting} + +\subsubsection{Impact on Image Size} + +This architecture drastically reduces the storage footprint of the inference artifact. By decoupling the static libraries from the application logic, we achieve the following reduction: + +% \begin{table*}[t!] +% \centering +% \begin{tabular}{|l|c|c|} +% \hline +% \textbf{Component} & \textbf{Traditional Approach} & \textbf{Hybrid NFS Approach} \\ \hline +% Base Image (Python Slim) & $\sim$150 MB & $\sim$150 MB \\ \hline +% PyTorch & $\sim$3.5 GB & \textbf{0 MB (Mounted)} \\ \hline +% Transformers & $\sim$500 MB & \textbf{0 MB (Mounted)} \\ \hline +% Application Code & $<1$ MB & $<1$ MB \\ \hline +% Model Weights & $\sim$100 MB & $\sim$100 MB \\ \hline +% \textbf{Total Image Size} & \textbf{8.93 GB} & \textbf{$~250$ MB} \\ \hline +% \end{tabular} +% \caption{Comparison of Docker Image Sizes} +% \end{table*} + +% This \textbf{99.03\% reduction} in image size results in: +% \begin{itemize} +% \item Faster deployment and rollback times. +% \item Significantly lower network bandwidth usage. +% \item Efficient storage utilization on the Docker VM. +% \end{itemize} + +\subsection{Hybrid Inference Architecture with NFS and Docker} + +This section outlines the architectural design of our hybrid machine learning deployment strategy, detailing the distinct roles of the Library VM and the Docker VM, and how they interact to optimize resource usage. 
+ +\subsubsection{Library Virtual Machine (NFS Server)} + +The \textbf{Library VM} serves as the centralized repository for the heavy components of the machine learning environment. Its primary function is to host large, static dependencies such as the Python runtime environment, deep learning frameworks (e.g., PyTorch, TensorFlow), and specialized libraries (e.g., Transformers, CUDA routines). + +By consolidating these resource-intensive libraries on a single machine, we avoid the redundancy of installing them on every inference node. This machine acts as a Network File System (NFS) server, exporting its directory structure to be accessed by other machines in the network. + +\paragraph{What is an NFS Server?} + +A \textbf{Network File System (NFS)} server is a computer that allows other machines (clients) to access its files over a network as if they were stored locally. In our architecture, the NFS server ``shares'' the directory containing the Python libraries. The client machines can then read these files directly, eliminating the need to physically copy the heavy libraries to each client. + +\subsubsection{Docker Virtual Machine (Inference Node)} + +The \textbf{Docker VM} is the compute-centric node responsible for executing the inference workload. It hosts the Docker engine and runs the lightweight containerized application. + +This machine does not permanently store the heavy ML libraries. Instead, it mounts the shared directory from the Library VM at runtime. Reliable network connectivity to the Library VM ensures that the Docker container has immediate access to the necessary software dependencies. + +\subsubsection{Hybrid Deployment Strategy} + +The hybrid strategy combines the isolation and portability of Docker with the efficiency of centralized storage. 
+ +\begin{enumerate} + \item \textbf{Decoupling Environment and Application}: We separate the rapidly changing application code (API logic, business rules) from the slowly changing environment (Python packages). The application code resides inside the Docker image, while the environment resides on the NFS share. + \item \textbf{Runtime Linking}: When the Docker container starts, it mounts the NFS share. The container's environment variables are configured to add this mounted path to its Python path. This allows the Python interpreter inside the container to import modules (like \texttt{torch} or \texttt{transformers}) from the network share as if they were installed locally. + \item \textbf{Drastic Image Reduction}: Since the Docker image only contains the application code and minimal system dependencies, its size is reduced from several gigabytes to a few hundred megabytes. This facilitates rapid deployments, faster scaling, and reduced storage costs. +\end{enumerate} + +This architecture essentially transforms the Docker container into a lightweight "shell" that borrows its heavy "engine" from the Library VM only when needed. + +\subsubsection{Identifying the Virtual Machine Roles} + +The architecture explicitly designates two separate machines for distinct purposes. Based on the configuration scripts, their roles are defined as follows: + +\paragraph{1. The Library VM (Environment Host)} +This machine acts as the \textbf{storage backend} for the machine learning environment. +\begin{itemize} + \item \textbf{Role}: It hosts the actual Python environment (Torch, Transformers, etc.) on its local filesystem and exports it via NFS. + \item \textbf{Identifier}: In our configuration (see \texttt{mount\_libs.sh}), this machine is identified by the IP address \texttt{172.16.203.14}. + \item \textbf{Key Path}: The environment resides at \texttt{/home/iiitb/Documents/textClassificationVolume}. + \item \textbf{Action}: It does \textit{not} run the Docker container. 
Ideally, it simply stays online to serve files to other machines. +\end{itemize} + +\paragraph{2. The Docker VM (Inference Runner)} +This machine acts as the \textbf{compute frontend} that serves the API. +\begin{itemize} + \item \textbf{Role}: It builds and runs the lightweight Docker container. It does not have the deep learning libraries installed on its own disk; it borrows them from the Library VM. + \item \textbf{Identifier}: This is the machine where you execute the \texttt{mount\_libs.sh} and \texttt{run\_inference.sh} scripts. + \item \textbf{Key Path}: It mounts the remote library to the local path \texttt{/mnt/text-libs}. + \item \textbf{Action}: It executes the \texttt{docker run} command, effectively "bringing the code to the data" (or in this case, bringing the library data to the code container). +\end{itemize} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|l|l|} +\hline +\textbf{Feature} & \textbf{Library VM} & \textbf{Docker VM} \\ \hline +\textbf{Primary Function} & Storage \& NFS Server & Model Inference \& API Hosting \\ \hline +\textbf{IP Address} & \texttt{172.16.203.14} & (Assigned by Network) \\ \hline +\textbf{Python Libs} & Stored Physically on Disk & Mounted via Network (NFS) \\ \hline +\textbf{Docker Image} & Not required & Builds \& Runs Lightweight Image \\ \hline +\end{tabular} +\caption{Distinction between Library VM and Docker VM} +\end{table*} + + + + +\newpage + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 4.02 GB & $\sim$1 GB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ +\clearpage + +\section{Task: LSTM implementation} +\newpage + +\subsection{Introduction} +This document outlines the detailed architectural, mathematical, and translational specifics of implementing a Long Short-Term Memory (LSTM) model across two prominent machine learning environments: Rust (using the Burn framework) and Python (using PyTorch). It covers the model architecture, training pipelines, specialized deployment techniques using network filesystems (NFS) with Docker, and language-specific design implications. + +\subsection{Model Architecture and Mathematical Formulation} + +\subsubsection{Mathematical Foundation of the LSTM Cell} +The core of the model revolves around a custom, manually-implemented LSTM cell. Instead of relying on the standard un-inspectable black-box LSTM implementations provided by typical ML libraries, both codebases explicitly define the cell-level math. + +For a given timestep $t$, the input tensor $x_t$ and the previous hidden state $h_{t-1}$ are used to compute the various gates. The mathematical formulation utilized is: +\begin{align} + f_t &= \sigma(W_f \cdot [h_{t-1}, x_t] + b_f) \quad &\text{(Forget Gate)} \\ + i_t &= \sigma(W_i \cdot [h_{t-1}, x_t] + b_i) \quad &\text{(Input Gate)} \\ + g_t &= \tanh(W_g \cdot [h_{t-1}, x_t] + b_g) \quad &\text{(Candidate State)} \\ + o_t &= \sigma(W_o \cdot [h_{t-1}, x_t] + b_o) \quad &\text{(Output Gate)} +\end{align} +\begin{align} + c_t &= f_t \odot c_{t-1} + i_t \odot g_t \quad &\text{(New Cell State)} \\ + h_t &= o_t \odot \tanh(c_t) \quad &\text{(New Hidden State)} +\end{align} + +Where: +\begin{itemize} + \item $\sigma$ represents the Sigmoid activation function. + \item $\tanh$ represents the Hyperbolic Tangent activation function. + \item $\odot$ denotes element-wise multiplication (Hadamard product). + \item $[h_{t-1}, x_t]$ symbolizes the concatenation of the previous hidden state and the current input. 
+\end{itemize} + +\subsubsection{Architectural Details} +Both implementations adhere strictly to the following architectural design: +\begin{enumerate} + \item \textbf{Layer Normalization:} Pre-activation gates, the cell state ($c_t$), and the hidden state ($h_t$) pass through separate \texttt{LayerNorm} layers. This design choice stabilizes training dynamics since the feature distributions inside the LSTM evolve at every sequence step (making standard Batch Normalization ineffective). + \item \textbf{Optimized Gate Compute:} Instead of computing 4 separate linear transformations per timestep for the features, the model employs a single combined projection that outputs a $4 \times \text{hidden\_size}$ tensor. This tensor is subsequently split into four chunks corresponding to the $i, f, g$, and $o$ gates. + \item \textbf{Bidirectional Support:} An encapsulated \texttt{StackedLstm} module stacks multiple manual LSTM layers (applying dropout between layers except on the final one). The main \texttt{LstmNetwork} integrates a forward processing stack and an optional backward processing stack (which flips the temporal dimension of the input sequence). Their respective output hidden states are concatenated along the feature dimension before passing through a fully-connected projection head. + \item \textbf{Bias Initialization:} The gate parameters are initialized with Xavier Normal initialization, after which the forget-gate slice of the bias is explicitly overwritten with a constant $1.0$ to prevent severe gradient decay early in training. +\end{enumerate} + + +\subsection{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. 
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{RustLocust/LSTM.pdf} + + + +\subsection{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{PythonLocust/LSTM.pdf} + +\subsection{Training Pipeline} +The training behavior is intentionally synchronized to ensure parity between the languages: +\begin{itemize} + \item \textbf{Data Loading:} Operates synchronously on synthetically generated noisy sequential datasets. The validation set is sized at $20\%$ of the training set in both implementations. + \item \textbf{Optimization Algorithm:} Utilizes the Adam Optimizer. + \item \textbf{Loss Function:} Mean Squared Error (MSE), with reduction set to \textit{mean}. Both implementations weight each batch loss by its batch size when accumulating the epoch loss and divide by the total number of samples at the end of the epoch, so that a smaller final batch does not skew the average. + \item \textbf{Gradient Clipping:} Ensures numerical stability on longer sequence inputs. 
The gradient norm is clipped to a maximum of $1.0$ immediately before each optimizer step. + \item \textbf{Artifacts Output:} Training scripts generate an \texttt{artifact\_dir} where they store a \texttt{config.json} representation of hyperparameters, and the full state dictionary (\texttt{model.pt} in PyTorch; CompactRecorder files in Rust Burn). +\end{itemize} + +\subsection{Inference Pipeline and Docker NFS Integration} +\subsubsection{PyTorch Inference Architecture} +A critical requirement for modern PyTorch inference deployments is resolving the massive disk footprint of CUDA-enabled PyTorch backend libraries. The PyTorch pipeline employs a Network File System (NFS) based setup to achieve a highly optimized, lightweight Dockerized inference deployment: +\begin{enumerate} + \item \textbf{External Library Mounting:} A host-level script (\texttt{mount\_libs.sh}) mounts an external NAS/NFS export (served from \texttt{172.16.203.14}) that holds the Python environment at the local mount point \texttt{/mnt/LSTM-libs}. + \item \textbf{Optimized Dockerfile:} The image leverages the \texttt{nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04} base image and installs basic \texttt{python3.11} runtime headers without calling \texttt{pip install torch}. Thus, the final image is far smaller than standard ML images that bundle PyTorch. + \item \textbf{Runtime Binding:} The container launch script (\texttt{run\_container.sh}) binds this volume mount (\texttt{-v \$NFS\_MOUNT\_POINT:/external-libs}) and, crucially, overrides the \texttt{PYTHONPATH} environment variable: + \begin{lstlisting} + -e PYTHONPATH="$CONTAINER_LIB_MOUNT/LSTM_env/lib/.../site-packages" + \end{lstlisting} + \item \textbf{Inference Execution:} \texttt{app.py} loads the model weights off an abstracted configuration path, builds a zero-gradient loader, runs inference iteratively over a single collapsed batch, and yields predictions natively. 
+\end{enumerate} + +\subsubsection{Rust Inference Architecture} +Rust's inference pipeline diverges significantly regarding deployment complexity due to compilation structures: +\begin{itemize} + \item \textbf{Stateless Binaries:} No containerized runtime libraries are mandated because Burn compiles statically down to heavily optimized binaries, pulling model states directly via the \texttt{CompactRecorder}. + \item \textbf{Visualization:} Results are mapped into native Polars \texttt{DataFrame} objects (\texttt{df![]}) rendering lightweight native tables detailing \textit{expected targets} versus \textit{computed predictions}. +\end{itemize} + +\subsection{Implementation Specifics} +\subsubsection{PyTorch Specific Constraints} +\begin{itemize} + \item \textbf{Dynamic computation graphing:} The \texttt{model.py} cleanly slices and chunks gates natively on tensors (e.g., \texttt{gates.chunk(4, dim=1)}). + \item \textbf{Sequence Reversals:} Done programmatically via continuous \texttt{Tensor.flip(dims=[1])} which mandates that tensors must remain contiguously stored within PyTorch internals to avoid memory reallocation overhead. + \item \textbf{Seed Setting API:} Requires explicitly seeding four subsystems (\texttt{random, numpy, torch, torch.cuda}) to match the reproducibility of the Rust implementation. +\end{itemize} + +\subsubsection{Rust (Burn) Specific Constraints} +\begin{itemize} + \item \textbf{Compile-Time Dimension Types:} Rust explicitly binds tensor dimensionality at compile time through the rank parameter of the tensor type (e.g., \texttt{Tensor<B, 2>} vs \texttt{Tensor<B, 3>}). This offers strong safety guarantees by rejecting, at compile time, rank mismatches that PyTorch would only report as runtime errors. + \item \textbf{Trait Encapsulation:} Leverages explicit trait architectures (\texttt{\#[derive(Module, Config)]}) that automate saving hyperparameters and generating gradient backends. Burn models must be mapped cleanly from standard states to \texttt{autodiff} states. 
+ \item \textbf{No-Mutation Logic:} State mutations generated sequentially in LSTMs are represented safely utilizing explicit struct destructuring via \texttt{LstmState\{hidden, cell\}}, bypassing complex internal pointer tracking. + \item \textbf{Explicit Initialization Handling:} Since Burn offers limited out-of-the-box support for orthogonal initialization, Xavier Normal initialization was invoked explicitly, paired with \texttt{slice\_assign} to write the constant $1.0$ fill into the forget-gate components. +\end{itemize} + +\subsection{Rust: Training Loss Progression and Model Convergence} + +The model was trained for a total of 30 epochs. During training, both the training loss and validation loss decreased substantially, indicating that the model was able to learn meaningful patterns from the data. + +\subsubsection{Training Progress} + +The training process began with relatively high loss values. However, as training progressed, both the average training loss and average validation loss consistently decreased. + +The recorded loss values at different stages of training are shown below: + +\begin{table*}[t!] +\centering +\caption{Training and Validation Loss Progression} +\begin{tabular}{|c|c|c|} +\hline +\textbf{Epoch} & \textbf{Average Training Loss} & \textbf{Average Validation Loss} \\ +\hline +5 & 4456.9658 & 4473.4448 \\ +10 & 2510.1016 & 2438.3970 \\ +15 & 900.7457 & 801.6573 \\ +20 & 154.4127 & 164.8311 \\ +25 & 48.2149 & 20.1441 \\ +30 & 52.1122 & 17.5850 \\ +\hline +\end{tabular} +\label{tab:loss_progression} +\end{table*} + +\subsubsection{Loss Trend Analysis} + +The training loss decreased from 4456.9658 at epoch 5 to 52.1122 at epoch 30. Similarly, the validation loss decreased from 4473.4448 to 17.5850 over the same period. + +This large reduction in both training and validation loss suggests that the model successfully converged during training. 
+ +Although the training loss slightly increased between epoch 25 and epoch 30, the validation loss continued to decrease. This indicates that the model continued to improve its ability to generalize to unseen data. + +The lowest validation loss achieved during the experiment was: + +\[ +17.5850 +\] + +at epoch 30. + +\subsubsection{Generalization Performance} + +The close alignment between the training loss and validation loss throughout training suggests that the model did not suffer from severe overfitting. + +In the earlier epochs, both losses were very high, which is expected because the model parameters were still being optimized. As training continued, the losses dropped rapidly, especially between epochs 10 and 25. + +This behavior indicates that the model learned most of its predictive capability during the middle phase of training. + +\subsubsection{Execution Time} + +The complete training process required: + +\begin{itemize} + \item Real time: 6 minutes and 42.185 seconds + \item User CPU time: 6 minutes and 42.414 seconds + \item System CPU time: 3.49 seconds +\end{itemize} + +The relatively low system CPU time compared to user CPU time suggests that most of the runtime was spent performing model computation rather than operating system overhead. +\subsection{Python: LSTM Model Architecture and Training Performance} + +The Long Short-Term Memory (LSTM) model consists of a 2-layer bidirectional LSTM followed by a fully connected output layer. Dropout is applied between LSTM layers to improve generalization. + +\textbf{Model Architecture:} + + +The model was trained for 30 epochs. Training and validation performance improved significantly and consistently. + +Training loss decreased from 5543.18 to 56.11 (98.99\% reduction), while validation loss decreased from 5699.26 to 48.92. +The model achieved strong regression performance, with validation RMSE reaching 6.99 and MAE reaching 4.33. 
+ +The $R^2$ score improved from negative values to 0.9517, indicating strong predictive capability. + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Loss & 56.11 & 30 & 5543.18 & 1 \\ +Train & RMSE & 7.49 & 30 & 74.45 & 1 \\ +Train & MAE & 5.03 & 30 & 67.25 & 1 \\ +Train & R$^2$ & -4.41 & 1 & 0.9452 & 30 \\ +Train & Grad Norm (Total) & 524.61 & 1 & 4972.41 & 24 \\ +Train & Iteration Speed (it/s) & 5.02 & 29 & 6.94 & 4 \\ +Train & CPU Memory (GB) & 0.87 & -- & 1.13 & -- \\ +Train & CPU Usage (\%) & 72.4 & -- & 88.8 & -- \\ +\hline + +Valid & Loss & 47.96 & 29 & 5699.26 & 1 \\ +Valid & RMSE & 6.93 & 29 & 75.49 & 1 \\ +Valid & MAE & 3.54 & 28 & 68.51 & 1 \\ +Valid & R$^2$ & -4.63 & 1 & 0.9526 & 29 \\ +Valid & Iteration Speed (it/s) & 5.02 & 29 & 6.94 & 4 \\ +Valid & CPU Memory (GB) & 0.87 & -- & 1.13 & -- \\ +Valid & CPU Usage (\%) & 72.4 & -- & 88.8 & -- \\ +\hline + +\end{tabular} +\caption{Extended LSTM Training and Validation Metrics Summary} +\end{table*} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 158.63 seconds + \item Average Epoch Time: 5.18 seconds + \item Iteration Speed (Mean): 6.22 it/s + \item Gradient Norm (Mean): 1394.59 + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + + + + +\newpage + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 974MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ +\clearpage + +\EOD +\end{document} \ No newline at end of file diff --git a/latex_reports/draft_2.tex b/latex_reports/draft_2.tex new file mode 100644 index 0000000..b92a0d1 --- /dev/null +++ b/latex_reports/draft_2.tex @@ -0,0 +1,1297 @@ +\documentclass{ieeeaccess} +\usepackage{cite} +\usepackage{amsmath,amssymb,amsfonts} +\usepackage{algorithmic} +\usepackage{textcomp} + +\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em + T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}} + +% Encoding and fonts +\usepackage[T1]{fontenc} +\usepackage[utf8]{inputenc} +\usepackage{pdfpages} +\usepackage{enumitem} +\setlist{noitemsep,topsep=0pt,parsep=0pt,partopsep=0pt} + +% Math and graphics +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{array} +\usepackage{float} + +% URLs (robust line breaking) +\usepackage{url} +\usepackage[hidelinks]{hyperref} +\def\UrlBreaks{\do/\do-\do_} + +% Code listings (stable for IEEE) +\usepackage{listings} +\usepackage{xcolor} +\lstset{ + basicstyle=\ttfamily\footnotesize, + breaklines=true, + breakatwhitespace=false, % <-- IMPORTANT (change this) + columns=fullflexible, + keepspaces=true, + showstringspaces=false +} + +% Custom Dockerfile language +\lstdefinelanguage{Dockerfile}{ + keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]" +} + +\begin{document} +\history{Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.} +\doi{10.1109/ACCESS.2017.DOI} + +\title{System-Level Evaluation of Rust and Python for Machine Learning} +\author{\uppercase{Project Elective}\authorrefmark{1}, +\IEEEmembership{Member, IEEE}} +\address[1]{Project Elective (e-mail: project@elective.com)} +\tfootnote{This paragraph of the first footnote will contain support +information, including sponsor and financial support acknowledgment. 
For +example, ``This work was supported in part by the U.S. Department of +Commerce under Grant BS123456.''} + +\markboth +{Project Elective \headeretal: System-Level Evaluation of Rust and Python for Machine Learning} +{Project Elective \headeretal: System-Level Evaluation of Rust and Python for Machine Learning} + +\corresp{Corresponding author: Project Elective (e-mail: project@elective.com).} + +\begin{abstract} +These instructions give you guidelines for preparing papers for +IEEE Access. Use this document as a template if you are +using \LaTeX. Otherwise, use this document as an +instruction set. The electronic file of your paper will be formatted further +at IEEE. Paper titles should be written in uppercase and lowercase letters, +not all uppercase. Avoid writing long formulas with subscripts in the title; +short formulas that identify the elements are fine (e.g., "Nd--Fe--B"). Do +not write ``(Invited)'' in the title. Full names of authors are preferred in +the author field, but are not required. Put a space between authors' +initials. The abstract must be a concise yet comprehensive reflection of +what is in your article. In particular, the abstract must be self-contained, +without abbreviations, footnotes, or references. It should be a microcosm of +the full article. The abstract must be between 150--250 words. Be sure that +you adhere to these limits; otherwise, you will need to edit your abstract +accordingly. The abstract must be written as one paragraph, and should not +contain displayed mathematical equations or tabular material. The abstract +should include three or four different keywords or phrases, as this will +help readers to find it. It is important to avoid over-repetition of such +phrases as this can result in a page being rejected by search engines. +Ensure that your abstract reads well and is grammatically correct. +\end{abstract} + +\begin{keywords} +Enter key words or phrases in alphabetical +order, separated by commas. 
For a list of suggested keywords, send a blank +e-mail to keywords@ieee.org or visit \underline +{http://www.ieee.org/organizations/pubs/ani\_prod/keywrd98.txt} +\end{keywords} + +\titlepgskip=-15pt + +\maketitle +\section{Overview of the Project} + +This project studies the use of \textbf{Rust} as an alternative systems language for machine learning workflows traditionally implemented in \textbf{Python}. +Rather than focusing on state-of-the-art model performance, the emphasis is on: + +\begin{itemize} + \item feasibility of end-to-end ML workflows, + \item system stability and reproducibility, + \item developer experience and DevOps complexity, + \item deployment and operational characteristics. +\end{itemize} + +To ensure clarity and rigor, the work is organized into \textbf{two clearly separated experimental tracks}. + +\hrule + +\section{Project Structure: Two-Track Evaluation} + +The project consists of the following two tracks: + +\subsection*{Track 1: Training-Based Systems Evaluation} +This track compares \textbf{machine learning training pipelines} implemented in: +\begin{itemize} + \item PyTorch (Python), and + \item Burn (Rust). +\end{itemize} + +The goal is to evaluate training feasibility, stability, compile-time guarantees, and DevOps impact, rather than raw training speed. + +\subsection*{Track 2: Inference-Based DevOps Evaluation} +This track compares \textbf{production-style inference services} implemented in: +\begin{itemize} + \item Python-based ONNX inference, and + \item Rust-based ONNX inference. +\end{itemize} + +The focus is on deployment, security, containerization, CI/CD behavior, and runtime efficiency. + +Each track is designed to answer a distinct research question while remaining complementary. + +\section{Machine Learning Tasks Considered} + +To ensure coverage of diverse ML workloads, the following tasks are identified: + +\begin{itemize} + \item \textbf{Text Classification}: Dataset to be finalized. 
+ \item \textbf{Image Classification}: MNIST dataset. + \item \textbf{Credit Score Assignment}: Supervised classification task. + \item \textbf{Multi-Objective Machine Learning}: Brain Tumor dataset with a MOML formulation. + \item \textbf{Fine-Tuning Task}: BERT-based classification (ANLP Assignment 1), with optional LoRA / QLoRA. + \item \textbf{Autoregressive Decoding}: Experiments using the Burn framework. +\end{itemize} + +At the current stage, the \textbf{MNIST image classification task has been fully implemented}. +The corresponding training code is available in the project GitHub repository. + +\hrule + +\section{Related Work} + +The following research papers are being used to guide experimental design and evaluation: + +\begin{itemize} + \item \url{https://ieeexplore.ieee.org/document/11126113} + \item \url{https://ieeexplore.ieee.org/document/11261485} + \item \url{https://ieeexplore.ieee.org/document/11212348} + \item \url{https://www.ijsred.com/volume8/issue2/IJSRED-V8I2P143.pdf} +\end{itemize} + +\hrule + +\section{Code Repository and Current Status} + +Project repository: +\begin{center} +\url{https://github.com/Abhinav-Kumar012/Rust_Python_ML_PE.git} +\end{center} + +Current progress includes: +\begin{itemize} + \item MNIST training pipeline implemented + \item PyTorch baseline established + \item Initial Rust (Burn) training setup completed +\end{itemize} + +\hrule + +\section{Track 1: Training-Based Systems Evaluation} + +\subsection{Objective} + +The objective of this track is to answer the following research question: + +\begin{quote} +\textit{Can Rust realistically support end-to-end machine learning training pipelines, and what system-level trade-offs does this introduce compared to PyTorch?} +\end{quote} + +This track explicitly avoids speed-centric benchmarking and instead focuses on system behavior. 
+ +\hrule + +\subsection{Frameworks Compared} + +\subsubsection{PyTorch (Baseline)} +\begin{itemize} + \item Language: Python + \item Training maturity: Very high + \item Ecosystem: Extensive +\end{itemize} + +\subsubsection{Rust (Burn)} +\begin{itemize} + \item Language: Rust + \item Training maturity: Emerging + \item Design: Idiomatic Rust, native training support +\end{itemize} + +\hrule + +\subsection{Experimental Controls} + +\textbf{Fixed Across Both Implementations} +\begin{itemize} + \item Dataset splits + \item Number of epochs + \item Batch size + \item Optimizer type + \item Learning rate + \item Hardware +\end{itemize} + +\textbf{Allowed Differences} +\begin{itemize} + \item Internal kernel implementations + \item Graph execution model + \item Memory management +\end{itemize} + +\hrule + +\subsection{Metrics Collected} + +\begin{itemize} + \item Training time per epoch (reported cautiously) + \item Loss curves and convergence behavior + \item Runtime failures and numerical stability + \item Reproducibility across runs + \item Environment setup and build complexity + \item Dependency footprint and artifact size +\end{itemize} + +\hrule + +\section{Track 2: Inference-Based DevOps Evaluation} + +\subsection{Objective} + +The objective of this track is to compare \textbf{deployment, security, and operational characteristics} of Python-based and Rust-based ML inference services executing the same ONNX model. + +\hrule + +\subsection{Inference Services Compared} + +\textbf{Python Service} +\begin{itemize} + \item FastAPI + Uvicorn + \item ONNX Runtime (Python) +\end{itemize} + +\textbf{Rust Service} +\begin{itemize} + \item Axum / Actix + \item burn-rs +\end{itemize} + +Both services expose identical inference endpoints and return identical outputs. 
+ +\hrule + +\subsection{Evaluation Dimensions} + +\begin{itemize} + \item CI/CD build behavior + \item Container image size and layering + \item Cold-start latency + \item Inference latency and throughput + \item Resource utilization + \item Security and supply-chain surface +\end{itemize} + +\hrule + +\section{Upcoming Work} + +The following tasks are planned for the next phase of the project: + +\begin{itemize} + \item Develop production-style inference services for both Python and Rust. + \item Write Dockerfiles for Python and Rust inference services. + \item Set up Jenkins-based CI pipelines for inference, including build, test, containerization, and security scanning. +\end{itemize} + +\section{Task: MNIST Image Classification} +\subsection{Architecture Details} + + + + +\subsection{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} + +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\begin{figure}[H] +\centering +\includegraphics[width=\linewidth]{RustLocust/MNIST.pdf} +\caption{Rust Backend Load Testing Dashboard for MNIST} +\end{figure} + +\subsection{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency.
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\begin{figure}[H] +\centering +\includegraphics[width=\linewidth]{PythonLocust/MNIST.pdf} +\caption{Python Backend Load Testing Dashboard for MNIST} +\end{figure} + +\subsection{Docker Containerization Strategy} +Both Rust and Python inference workflows leverage highly optimized container strategies. For Rust, a multi-stage Docker build compiles the application within an \texttt{ubuntu:16.04} builder and transfers the standalone binary to a minimal \texttt{nvidia/vulkan:1.3-470} runtime image. + + + + +The Python (PyTorch) container minimizes footprint by avoiding framework installation inside the image. Utilizing \texttt{python:3.12-slim}, it only installs \texttt{fastapi} and maps heavy ML dependencies at runtime via an external NFS volume (\texttt{PYTHONPATH} override). This reduces the image size from gigabytes to under 150MB, drastically accelerating deployments. + + +\subsection{Rust: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model used for the experiment consisted of two convolutional layers followed by adaptive average pooling, dropout, and two fully connected layers. 
The complete architecture is shown below: + +\begin{lstlisting} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 80} + conv2: Conv2d {ch_in: 8, ch_out: 16, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 1168} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, bias: true, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, bias: true, params: 5130} + activation: Relu + params: 531178 +} +\end{lstlisting} + +The model was trained for 10 epochs. Over the course of training, both the training and validation performance improved consistently. Training accuracy increased from 81.575\% in the first epoch to 97.300\% in the final epoch, while validation accuracy improved from 92.133\% to 98.517\%. + +Similarly, the training loss decreased significantly from 0.656 to 0.087, and the validation loss reduced from 0.258 to 0.054 by the end of training. The macro F1-score also improved substantially, reaching 96.974\% for training and 98.321\% for validation. + + + +The results indicate that the model achieved strong generalization performance with minimal overfitting, as the validation accuracy remained slightly higher than the training accuracy throughout the experiment. The consistently high Top-5 accuracy values further demonstrate that the model was able to correctly identify the correct class within its top predictions. + +It should also be noted that the execution terminated with a segmentation fault after training completion. However, since the fault occurred after all epochs had been completed and metrics had already been recorded, it did not affect the validity of the training results. \\ + +The time taken to train the model is 229.916s (3min 49.916s). 
+ + + +\subsection{Python: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model implemented in Python mirrors the Rust architecture, consisting of two convolutional layers followed by adaptive average pooling, dropout regularization, and two fully connected layers. + +\textbf{Model Architecture:} + +\begin{lstlisting} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, kernel_size: [3, 3], stride: [1, 1]} + conv2: Conv2d {ch_in: 8, ch_out: 16, kernel_size: [3, 3], stride: [1, 1]} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, params: 5130} + activation: ReLU + total params: 531178 +} +\end{lstlisting} + +The model was trained for 10 epochs. Training and validation performance improved consistently over time. + +Training accuracy increased from 82.01\% to 97.29\%, while validation accuracy improved from 92.87\% to 98.20\%. +Training loss decreased significantly from 0.5947 to 0.0867, and validation loss reduced from 0.2475 to 0.0579. + +The macro F1-score reached 0.9814, demonstrating strong classification performance. Additionally, the Top-5 accuracy achieved 99.98\%, indicating highly reliable predictions. + + + +The results indicate strong generalization performance with no signs of overfitting. Validation accuracy remained consistently high and closely followed training accuracy. + +Training was stable with zero NaN events observed. The total training time was 182.23 seconds, with an average epoch time of 15.05 seconds and an average iteration speed of 50.58 iterations per second. 
+ + + +\section{Task: Regression} + +\subsection{Introduction} +This document outlines the architectural, mathematical, and deployment specifics of implementing a Neural Network-based Regression model across two disparate machine learning environments: Rust (utilizing the Burn framework) and Python (utilizing PyTorch). It covers the distinct model architecture decisions, dataset handling strategies, and specialized pipeline deployment techniques leveraging Network File Systems (NFS) mapping via Docker bounds. + +\subsection{Model Architecture and Mathematical Formulation} + +\subsubsection{Mathematical Foundation} +The core mathematical foundation deployed across both frameworks is a classical Feed-Forward Neural Network consisting of a single hidden dimension mapping inputs directly onto a continuous single-variable regression output. + +For a given input feature vector $X \in \mathbb{R}^N$ (where $N$ dictates the feature size depending on the target dataset), the network's forward transformation can be represented sequentially as: +\begin{align} + Z_1 &= X \cdot W_1^T + b_1 \quad &\text{(Input Projection)} \\ + A_1 &= \max(0, Z_1) \quad &\text{(ReLU Activation)} \\ + \hat{Y} &= A_1 \cdot W_2^T + b_2 \quad &\text{(Output Projection)} +\end{align} + +Where: +\begin{itemize} + \item $W_1 \in \mathbb{R}^{H \times N}$ and $b_1 \in \mathbb{R}^H$ map the inputs onto the hidden vector space $H$. + \item $\max(0, \cdot)$ denotes the Non-Linear Rectified Linear Unit (ReLU) mapping algorithm. + \item $W_2 \in \mathbb{R}^{1 \times H}$ and $b_2 \in \mathbb{R}$ collapse the hidden abstraction onto the finalized regression scalar prediction $\hat{Y}$. +\end{itemize} + +\subsubsection{Architectural Configurations} +While the mathematical foundations are identical, implementations slightly differ based on dataset selections within the modules: +\begin{itemize} + \item \textbf{PyTorch Architecture:} Configures $N=13$ input features mapping to $H=64$ hidden parameters. 
+ \item \textbf{Rust (Burn) Architecture:} Configures $N=8$ input features likewise mapping to $H=64$ hidden parameters. +\end{itemize} +In both configurations, standard parameter biases (\texttt{bias=True}) are included and automatically initialized. + +\subsection{Training Pipelines} + +Both codebases train the model iteratively tracking gradients via the Adam optimizer scaled against Mean Squared Error (MSE) loss logic: +\[ \text{MSE} = \frac{1}{B} \sum_{i=1}^{B} (Y_i - \hat{Y}_i)^2 \] + +\subsubsection{PyTorch Context} +\begin{itemize} + \item \textbf{Data Loading:} Automatically pulls the \textbf{Boston Housing} dataset array (.npz file) from an external Google API via \texttt{urllib} and manually partitions it down into an explicit $80/20$ split. + \item \textbf{Telemetry Metrics:} Generates explicit hardware tracking loops inside the main epoch runner. Uses the \texttt{psutil} library to compute and stream epoch \texttt{iteration\_speed}, raw RAM consumption, and \texttt{cpu\_temp} hardware sensors parallel to the loss parameters. +\end{itemize} + +\subsubsection{Rust (Burn) Context} +\begin{itemize} + \item \textbf{Data Loading:} Links into Hugging Face's dataset registry asynchronously targeting the \textbf{California Housing} SQLized splits mapping onto memory arrays via localized \texttt{HousingDistrictItem} structs. + \item \textbf{Normalization Mapping:} Computes spatial min-max normalizations programmatically over inputs during training: + \[ X_{\text{norm}} = \frac{X - \text{min}}{\text{max} - \text{min}} \] + This logic restricts features within standard boundaries precluding exploding gradient derivations. +\end{itemize} + +\subsection{Inference Pipeline and Docker NFS Integration} + +Deploying these isolated pipelines necessitates radically different execution strategies, highlighting Python's heavyweight runtime dependency bottlenecks versus Rust's compile-time optimizations.
+ +\subsubsection{PyTorch Inference Architecture} +Standard PyTorch Docker environments routinely exceed several gigabytes due to CUDA bindings and generic scientific computation loops. To circumvent this inside microservices, the PyTorch inference pipeline mandates a hybrid Network File System (NFS) mapping architecture: +\begin{enumerate} + \item \textbf{NFS Mounting (\texttt{mount\_libs.sh}):} Installs an external \texttt{nfs-common} client locally and binds the extensive Python library volume from an external dedicated storage server (\texttt{172.16.203.14}) into the host machine's \texttt{/mnt/LSTM-libs} map. + \item \textbf{Lightweight Container Image:} The backend \texttt{Dockerfile} avoids \texttt{pip install} commands completely, simply initializing a barebone \texttt{nvidia/cuda:12.1.1} image mapping Python $3.11$ system links. + \item \textbf{Volume Inject (\texttt{run\_container.sh}):} The script initializes the container enforcing \texttt{-v} flags that sync the NFS \texttt{/mnt/LSTM-libs} directory seamlessly onto the container's \texttt{/external-libs}. Crucially, it overrides the system \texttt{PYTHONPATH} to target those external \texttt{site-packages} at runtime. + \item \textbf{Execution:} The \texttt{FastAPI} instance loads, bypasses massive disk pulls, links the models iteratively, and fields inbound \texttt{HousingFeatures} lists continuously. +\end{enumerate} + +\subsubsection{Rust (Burn) Inference Architecture} +Rust handles Docker microservices inherently via statically linked deployments: +\begin{itemize} + \item \textbf{Multi-Stage Compiling:} Executes a build phase operating within an oversized \texttt{rust:1.92-alpine} chain, ejecting the resulting binary onto an isolated stripped \texttt{alpine:3.23} environment structure. + \item \textbf{Native Routing:} Utilizes \texttt{Axum} servers to establish the HTTP logic endpoints securely routing JSON payloads mapping to specific feature names (e.g., \texttt{median\_income}, \texttt{house\_age}).
+\end{itemize} + +\subsection{Rust: Regression Model Performance Analysis} + +The regression model used in this experiment was a simple feed-forward neural network consisting of one hidden layer followed by an output layer. The model was designed to predict the median house value based on eight input features. + +\subsubsection{Model Architecture} + +The architecture of the regression model is shown below: + +\begin{lstlisting} +RegressionModel { + input_layer: Linear {d_input: 8, d_output: 64, bias: true, params: 576} + output_layer: Linear {d_input: 64, d_output: 1, bias: true, params: 65} + activation: Relu + params: 641 +} +\end{lstlisting} + +The model contains: + +\begin{itemize} + \item An input layer that maps 8 input features to 64 hidden units + \item A ReLU activation function applied after the hidden layer + \item An output layer that maps the 64 hidden units to a single scalar value +\end{itemize} + +The total number of trainable parameters in the model was only 641, making it a lightweight model suitable for fast training and inference. + +\subsubsection{Training Configuration} + +The model was trained for 100 epochs. A constant learning rate of: + +\[ +1.0 \times 10^{-3} +\] + +was used throughout the entire training process. + +\subsubsection{Training Performance} + +The training loss decreased substantially over the 100 epochs. Initially, the model started with a training loss of 3.086 during the first epoch. By the final epoch, the loss had reduced to 0.414. + +This significant reduction in loss indicates that the model successfully learned the underlying relationship between the input features and the target variable. + +\subsubsection{Validation Performance} + +Validation loss also showed a considerable improvement during training. The validation loss decreased from 4.132 in the first epoch to a minimum of 0.635 at epoch 51. 
+ +The difference between the final training loss and the minimum validation loss suggests that the model achieved good generalization performance without severe overfitting. + + + +\subsubsection{Prediction Example} + +A sample prediction generated by the model is shown below: + +\begin{lstlisting} +Predicted 2.021734 Expected 2.158 +\end{lstlisting} + +The predicted value is reasonably close to the expected value, indicating that the model was able to approximate the target variable with acceptable accuracy. + +Since the median house value was measured in units of 100,000 dollars, the prediction corresponds to: + +\begin{itemize} + \item Predicted value: approximately 202,173 dollars + \item Expected value: approximately 215,800 dollars +\end{itemize} + +\subsubsection{Predicted vs. Expected Distribution} + +The predicted-versus-expected plot suggests that the model captures the general trend in the target values, although some prediction errors remain for certain samples. + +Most of the predicted values appear concentrated around the central region of the distribution, indicating that the model performs better on common house value ranges than on extreme values. + +\subsubsection{Resource Utilization} + +The model required relatively little memory during execution. Training memory usage ranged from 2.125 GB to 2.325 GB, while validation memory usage ranged from 2.124 GB to 2.325 GB. + +CPU utilization remained moderate throughout training. Training CPU usage ranged from 19.539\% to 37.989\%, while validation CPU usage ranged from 19.550\% to 37.960\%. + +CPU temperature values were unavailable and therefore recorded as NaN. 
+ +\subsubsection{Execution Time and Failure} + +The complete training and evaluation process required: + +\begin{itemize} + \item Real time: 3 minutes and 18.257 seconds + \item User CPU time: 4 minutes and 13.554 seconds + \item System CPU time: 50.340 seconds +\end{itemize} + +\subsection{Language Specific Implementation Details} + +\subsubsection{PyTorch-Specific Paradigms} +\begin{itemize} + \item \textbf{Thread Clamping:} Due to inference optimization restrictions (especially running CPU variations alongside container structures), the \texttt{app.py} enforces explicit core binding calls via \texttt{torch.set\_num\_threads(1)} and \texttt{torch.set\_num\_interop\_threads(1)} securing computational resources and restricting OS context-switching overheads. + \item \textbf{Matrix Array Verifications:} Manually inspects raw matrix vector mappings validating dimensions dynamically against numeric constraints: \texttt{len(x) != NUM\_FEATURES} triggering runtime errors before pipeline evaluations fail. + \item \textbf{Manual Hardware Moving:} The framework is heavily littered with required \texttt{.to(device)} mapping configurations switching inputs, datasets, targets, and models manually between the host and external components. +\end{itemize} + +\subsubsection{Rust (Burn)-Specific Paradigms} +\begin{itemize} + \item \textbf{Generic Compile-Time Shapes:} Dimension mappings and tensor validations are fundamentally enforced inside the Rust compiler boundaries via \texttt{inputs: Tensor<B, 2>} arrays indicating batches of distinct input structures mapping to \texttt{targets: Tensor<B, 1>}. Invalid tensor ranks fail compilation, reducing the requirement for manual PyTorch matrix validations. + \item \textbf{Struct Batching Protocols:} Inference doesn't evaluate primitive float arrays.
Instead, the API relies on executing an overarching \texttt{HousingBatcher} which transforms specific struct domains (\texttt{HousingDistrictItem}) safely into tensor primitives while executing implicit \texttt{self.normalizer.to\_device(device)} logic silently against constants behind boundaries. + \item \textbf{Record Deserialization:} States are strictly detached from models via standard \texttt{.mpk} maps. They invoke explicit \texttt{NoStdTrainingRecorder::new().load()} tracking traits unbinding memory limits inherent to standard dict serialization configurations natively loaded via \texttt{RegressionModelConfig}. +\end{itemize} + +\newpage +\subsection{Python: Regression Model Architecture and Training Performance} + +The regression model used in this experiment is a lightweight fully connected neural network with a small number of parameters (961 total). The model is optimized using mean squared error loss. + + + +The model was trained for 100 epochs. Training loss decreased significantly from 8265.55 to 69.86 (99.15\% reduction), while validation loss decreased from 9045.34 to 55.70. + +Despite strong loss reduction, the model struggled to achieve good generalization. The validation $R^2$ score remained negative ($-1.07$), indicating that the model performs worse than a simple baseline predictor. + + + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 47.39 seconds + \item Average Epoch Time: 0.225 seconds + \item Iteration Speed (Mean): 10.36 it/s + \item Gradient Norm (Mean): 7981.31 + \item NaN Events: 0 + \item Convergence: Non-monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate that while optimization was successful in reducing loss, the model lacks sufficient capacity or feature representation to generalize well. The persistently negative validation $R^2$ suggests underfitting or a mismatch between model complexity and data characteristics. 
+ + + + +\section{Task: Text Classification (AG News)} +\subsection{Model Architecture and Training Strategy} + +The text classification system is built using the \texttt{Burn} framework in Rust, leveraging a Transformer-based architecture for feature extraction and a linear classification head. This section details the mathematical formulation of the model and the strategy employed for training. + +\subsubsection{Model Architecture} +The core of the model is a Transformer Encoder, which processes a sequence of token embeddings to capture contextual relationships. The architecture consists of three primary stages: embedding, encoding, and classification. + +\paragraph{Embedding Layer} +Input text is tokenized and converted into a sequence of indices $X \in \mathbb{N}^{B \times L}$, where $B$ is the batch size and $L$ is the sequence length. The model utilizes two parallel embedding layers: +\begin{enumerate} + \item \textbf{Token Embedding ($E_{tok}$)}: Maps token indices to dense vectors of dimension $d_{model}$. + \item \textbf{Positional Embedding ($E_{pos}$)}: Maps position indices $[0, \dots, L-1]$ to dense vectors of dimension $d_{model}$ to inject sequence order information. +\end{enumerate} + +The final embedding representation $E$ is obtained by averaging the token and positional embeddings: +\begin{equation} + E = \frac{E_{tok}(X) + E_{pos}(\text{positions})}{2} +\end{equation} + +\paragraph{Transformer Encoder} +The embedding tensor $E$ is passed through a multi-layer Transformer Encoder. Each layer consists of a Multi-Head Self-Attention (MHSA) mechanism followed by a Position-wise Feed-Forward Network (FFN), with residual connections and layer normalization. 
+ +The configuration used in this implementation is as follows: +\begin{itemize} + \item \textbf{Model Dimension ($d_{model}$)}: 256 + \item \textbf{Feed-Forward Dimension ($d_{ff}$)}: 1024 + \item \textbf{Number of Heads ($N_{heads}$)}: 8 + \item \textbf{Number of Layers ($N_{layers}$)}: 4 + \item \textbf{Normalization}: Layer norm applied before sub-layers (Pre-Norm). +\end{itemize} + +Let $H = \text{TransformerEncoder}(E)$, where $H \in \mathbb{R}^{B \times L \times d_{model}}$ represents the contextualized representations of the input sequence. + +\paragraph{Classification Head} +For classification, the model utilizes the representation of the first token (typically acting as the [CLS] token) from the encoded sequence. This vector is passed through a linear layer to project it into the class space: +\begin{equation} + Y = \text{Linear}(H_{[:, 0, :]}) +\end{equation} +where $Y \in \mathbb{R}^{B \times N_{classes}}$ represents the logits. For inference, a Softmax function is applied to obtain probabilities: +\begin{equation} + \hat{P} = \text{Softmax}(Y) +\end{equation} + +\begin{table*}[t!] 
+\centering +\caption{Model Architecture Summary} +\label{tab:model_arch} +\begin{tabular}{|l|l|c|c|} +\hline +\textbf{Component} & \textbf{Configuration / Details} & \textbf{Input Shape} & \textbf{Output Shape} \\ \hline +Token Embedding & $V \to d_{model}$ ($V$: Vocab Size) & $(B, L)$ & $(B, L, 256)$ \\ \hline +Pos Embedding & $L_{max} \to d_{model}$ & $(B, L)$ & $(B, L, 256)$ \\ \hline +Embedding Merge & Average ($E_{tok} + E_{pos}$) & - & $(B, L, 256)$ \\ \hline +Transformer Block & 4 Layers, 8 Heads, $d_{ff}=1024$ & $(B, L, 256)$ & $(B, L, 256)$ \\ \hline +Feature Extract & Slice First Token (Index 0) & $(B, L, 256)$ & $(B, 256)$ \\ \hline +Classifier Head & Linear ($256 \to N_{classes}$) & $(B, 256)$ & $(B, N_{classes})$ \\ \hline +\end{tabular} +\end{table*} + + + +\subsection{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\begin{figure}[H] +\centering +\includegraphics[width=\linewidth]{RustLocust/text.pdf} +\caption{Rust Backend Load Testing Dashboard for Text Classification} +\end{figure} + +\subsection{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. 
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\begin{figure}[H] +\centering +\includegraphics[width=\linewidth]{PythonLocust/text.pdf} +\caption{Python Backend Load Testing Dashboard for Text Classification} +\end{figure} + +\subsubsection{Training Strategy} +The model is trained using a supervised learning approach with the following configuration: + +\paragraph{Loss Function} +The training objective is to minimize the Cross-Entropy Loss between the predicted logits $Y$ and the ground truth class labels $C$: +\begin{equation} + \mathcal{L} = \text{CrossEntropy}(Y, C) = -\sum_{c=1}^{N_{classes}} \mathbb{1}_{c=C} \log\left(\frac{e^{Y_c}}{\sum_{j} e^{Y_j}}\right) +\end{equation} + +\paragraph{Optimization} +We employ the \textbf{Adam} optimizer with the following parameters: +\begin{itemize} + \item \textbf{Weight Decay}: $5 \times 10^{-5}$ + \item \textbf{Beta Coefficients}: Standard defaults (typically $\beta_1=0.9, \beta_2=0.999$) +\end{itemize} + +\paragraph{Learning Rate Scheduling} +A \textbf{Noam Learning Rate Scheduler} is used to stabilize training. The learning rate increases linearly during a warmup phase and then decays proportionally to the inverse square root of the step number. 
+\begin{equation} +\begin{aligned} +LR &= d_{model}^{-0.5} \cdot \min( \\ + &\quad step\_num^{-0.5}, \\ + &\quad step\_num \cdot warmup\_steps^{-1.5} +) +\end{aligned} +\end{equation} +\begin{itemize} + \item \textbf{Warmup Steps}: 1000 + \item \textbf{Base Learning Rate}: 0.01 +\end{itemize} + +\paragraph{Metrics} +During training and validation, the following metrics are tracked to monitor performance: +\begin{itemize} + \item \textbf{Loss}: Cross-Entropy Loss. + \item \textbf{Accuracy}: Percentage of correct predictions. + \item \textbf{F1-Score, Precision, Recall}: Macro-averaged metrics to account for class balance. +\end{itemize} + +\subsection{Burn Code Specifications} + +This section outlines the significant implementation details of the text classification system, focusing on the architectural choices in \texttt{model.rs} and the robust training pipeline defined in \texttt{training.rs}. +\subsubsection{Model Implementation (\texttt{model.rs})} +The \texttt{TextClassificationModel} leverages the \textbf{Burn} framework's modular design to implement a Transformer-based classifier. Key features of this implementation include: +\begin{itemize} + \item \textbf{Dual Embedding Strategy:} The model employs two distinct embedding layers: \texttt{embedding\_token} for semantic content and \texttt{embedding\_pos} for positional information. A unique characteristic of this implementation is the fusion strategy, where these embeddings are combined via averaging: + \[ + E_{final} = \frac{E_{pos} + E_{token}}{2} + \] + This differs from the standard summation approach often found in BERT implementations, potentially stabilizing the initial magnitude of the embedding vectors. + + \item \textbf{Configurable Architecture:} The system uses a \texttt{TextClassificationModelConfig} struct derived with the \texttt{Config} macro. 
This allows for type-safe and serializable hyperparameter management, ensuring the model architecture (hidden size, vocabulary size, sequence length) can be easily saved, loaded, and reproduced. + + \item \textbf{Masked Attention:} The forward pass actively utilizes padding masks (\texttt{mask\_pad}). These masks are passed into the \texttt{TransformerEncoderInput}, ensuring that the self-attention mechanism strictly ignores padding tokens, which is critical for handling variable-length text sequences correctly. + + \item \textbf{Separation of Train and Inference Logic:} The model explicitly implements the \texttt{TrainStep} and \texttt{InferenceStep} traits. + \begin{itemize} + \item \textbf{Training:} Returns a \texttt{ClassificationOutput} struct containing the calculated Cross-Entropy loss for backpropagation. + \item \textbf{Inference:} Returns raw probabilities by applying a softmax activation on the output logits, facilitating direct class prediction. + \end{itemize} +\end{itemize} +\subsubsection{Training Pipeline (\texttt{training.rs})} +The training module is designed for reliability and comprehensive observability. It integrates advanced optimization techniques and hardware-aware monitoring. +\begin{itemize} + \item \textbf{Noam Scheduler:} Transformer models are notoriously sensitive to learning rates. The code implements the \textbf{Noam Learning Rate Scheduler} (popularized by ``Attention Is All You Need''), which features a linear warmup phase (1000 steps) followed by an inverse square root decay based on the model dimension ($d_{model}$). This prevents gradient explosions during early training stages. + + \item \textbf{Distributed Training Support:} The implementation explicitly handles distributed computing scenarios. It utilizes Rust's feature flags (\texttt{\#[cfg(feature = "ddp")]}) to switch between single-device training and \textbf{Distributed Data Parallel (DDP)} strategies. 
When enabled, it employs a tree-based \texttt{AllReduceStrategy} for synchronizing gradients across multiple GPUs or nodes. + + \item \textbf{Comprehensive Telemetry:} The training loop is instrumented with an extensive suite of metrics beyond simple accuracy. It tracks: + \begin{itemize} + \item \textbf{Classification Metrics:} Macro-averaged F1-Score, Precision, and Recall, providing a holistic view of model performance on imbalanced datasets. + \item \textbf{Hardware Diagnostics:} CPU temperature, memory usage, and utilization are logged alongside training progress, aiding in the detection of thermal throttling or memory leaks during long training runs. + \end{itemize} + + \item \textbf{Efficient Data Sampling:} To manage large datasets efficiently, the loader utilizes a \texttt{SamplerDataset}. This limits the effective epoch size to 50,000 training samples and 5,000 validation samples, allowing for rapid iteration and feedback loops without needing to process the entire corpus in every epoch. +\end{itemize} + +\subsection{Rust: Transformer-Based Text Classification Model Performance} + +The text classification model used in this experiment was based on a Transformer encoder architecture. The model consisted of token embeddings, positional embeddings, a multi-layer Transformer encoder, and a final linear classification layer. 
+ +\subsubsection{Model Architecture} + +The architecture of the model is shown below: + +\begin{lstlisting} +TextClassificationModel { + transformer: TransformerEncoder { + d_model: 256, + d_ff: 1024, + n_heads: 8, + n_layers: 4, + dropout: 0.1, + norm_first: true, + quiet_softmax: true, + params: 3159040 + } + embedding_token: Embedding { + n_embedding: 28996, + d_model: 256, + params: 7422976 + } + embedding_pos: Embedding { + n_embedding: 256, + d_model: 256, + params: 65536 + } + output: Linear { + d_input: 256, + d_output: 4, + bias: true, + params: 1028 + } + n_classes: 4 + params: 10648580 +} +\end{lstlisting} + +The Transformer encoder used four encoder layers with eight attention heads per layer. Each layer had a model dimension of 256 and a feed-forward dimension of 1024. A dropout rate of 0.1 was used to reduce overfitting. + +The token embedding layer mapped a vocabulary of 28,996 tokens into 256-dimensional vectors. Positional embeddings of length 256 were also used so that the Transformer could capture token order information. + +The final output layer mapped the Transformer representation into four output classes. + +The total number of trainable parameters in the model was 10,648,580. + +\subsubsection{Training Configuration} + +The model was trained for a total of 5 epochs. During training, the learning rate decayed from $1.107 \times 10^{-5}$ in the first epoch to $3.733 \times 10^{-6}$ in the final epoch. + +\subsubsection{Training Performance} + +Training accuracy improved steadily from 57.968\% in the first epoch to 81.474\% in the fifth epoch. Similarly, the training loss decreased from 0.981 to 0.507. 
+ +The macro precision, recall, and F1-score also improved significantly during training: + +\begin{itemize} + \item Precision increased from 58.893\% to 81.606\% + \item Recall increased from 57.741\% to 81.334\% + \item F1-score increased from 50.201\% to 76.001\% +\end{itemize} + +These results indicate that the model learned meaningful semantic patterns in the text data over successive epochs. + +\subsubsection{Validation Performance} + +Validation performance also improved consistently. Validation accuracy increased from 72.280\% in the first epoch to a maximum of 81.640\% in the fourth epoch. + +The validation loss decreased from 0.731 to 0.507, showing that the model generalized reasonably well to unseen data. + +The validation precision, recall, and F1-score also showed strong improvement: + +\begin{itemize} + \item Precision increased from 72.230\% to 81.739\% + \item Recall increased from 72.258\% to 82.039\% + \item F1-score increased from 65.796\% to 76.509\% +\end{itemize} + +The relatively close values between training and validation accuracy suggest that the model did not suffer from severe overfitting. + +\begin{table*}[t!] +\centering +\caption{Training and Validation Metrics Summary for the Transformer-Based Text Classification Model} +% @kumar please layout sahi kar sakta hai? 
love you +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Accuracy & 57.968 & 1 & 81.474 & 5 \\ +Train & Loss & 0.507 & 5 & 0.981 & 1 \\ +Train & Precision@Top1 [Macro] & 58.893 & 1 & 81.606 & 5 \\ +Train & Recall@Top1 [Macro] & 57.741 & 1 & 81.334 & 5 \\ +Train & F1-Score@Top1 [Macro] & 50.201 & 1 & 76.001 & 5 \\ +Train & Learning Rate & $3.733 \times 10^{-6}$ & 5 & $1.107 \times 10^{-5}$ & 1 \\ +Train & CPU Memory (GB) & 2.401 & 2 & 2.736 & 5 \\ +Train & CPU Usage (\%) & 16.529 & 1 & 17.160 & 5 \\ +\hline +Valid & Accuracy & 72.280 & 1 & 81.640 & 4 \\ +Valid & Loss & 0.507 & 5 & 0.731 & 1 \\ +Valid & Precision@Top1 [Macro] & 72.230 & 1 & 81.739 & 5 \\ +Valid & Recall@Top1 [Macro] & 72.258 & 1 & 82.039 & 5 \\ +Valid & F1-Score@Top1 [Macro] & 65.796 & 1 & 76.509 & 5 \\ +Valid & CPU Memory (GB) & 2.263 & 1 & 2.747 & 5 \\ +Valid & CPU Usage (\%) & 20.331 & 1 & 22.190 & 3 \\ +\hline +\end{tabular} +\label{tab:transformer_text_classification_metrics} +\end{table*} + +\subsubsection{Resource Utilization} + +The CPU memory usage remained relatively stable throughout training. Training memory usage ranged from 2.401 GB to 2.736 GB, while validation memory usage ranged from 2.263 GB to 2.747 GB. + +CPU utilization also remained moderate, with training CPU usage ranging from 16.529\% to 17.160\% and validation CPU usage ranging from 20.331\% to 22.190\%. + +CPU temperature values were unavailable during the experiment and therefore recorded as NaN. + +\subsubsection{Execution Time and Failure} + +The complete training run required: + +\begin{itemize} + \item Real time: 32 minutes and 37.872 seconds + \item User CPU time: 36 minutes and 52.313 seconds + \item System CPU time: 1 minute and 41.258 seconds +\end{itemize} + +\subsection{PyTorch Training Pipeline} +This section details the Python implementation of the text classification training pipeline. 
The code mimics the architecture and logic of the Rust version to ensure comparable performance and behavior. +\subsubsection{Code Highlights} +\begin{itemize} + \item \textbf{Custom Transformer Model:} + The \texttt{TextClassificationModel} is a custom \texttt{nn.Module} containing: + \begin{itemize} + \item Dual embedding layers (\texttt{embedding\_token} and \texttt{embedding\_pos}). + \item A unique fusion strategy averaging the two embeddings: $E = (E_{pos} + E_{tok}) / 2$. + \item A standard \texttt{TransformerEncoder} stack. + \item A classification head that projects the encoded features to the 4 output classes of the AG News dataset. + \end{itemize} + \item \textbf{Noam Learning Rate Scheduler:} + A custom \texttt{NoamLR} scheduler is implemented to replicate the specific warmup and decay behavior used in the Rust implementation (and the original ``Attention Is All You Need'' paper). + \[ + lr = \text{factor} \cdot (d_{model}^{-0.5}) \cdot \min(step^{-0.5}, step \cdot warmup^{-1.5}) + \] + This ensures stable training dynamics for the Transformer architecture. + \item \textbf{Dataset Handling:} + The code utilizes the Hugging Face \texttt{datasets} library to load the ``ag\_news'' dataset. It explicitly shuffles and subsets the data (50,000 train, 5,000 test) to match the constraints applied in the Rust implementation, ensuring a fair apples-to-apples comparison between the two languages. + \item \textbf{Collate Function with Padding Masks:} + A custom \texttt{collate\_fn} handles dynamic batching. It tokenizes text using the \texttt{bert-base-cased} tokenizer and generates a boolean padding mask. 
Note the inversion logic: PyTorch's \texttt{TransformerEncoder} expects \texttt{True} for padded positions (unlike some other implementations where 1 implies validity), requiring careful mask generation: + \begin{lstlisting} + mask_pad = (encoding['attention_mask'] == 0) + \end{lstlisting} + \item \textbf{Training Loop:} + The training loop is a standard PyTorch implementation using \texttt{tqdm} for progress tracking. It uses \texttt{CrossEntropyLoss} as the criterion and the \texttt{Adam} optimizer. Crucially, the scheduler step is called after every batch (not every epoch), consistent with the Noam schedule requirements. +\end{itemize} + + +\subsection{Python: Text Classification (News) Transformer Model} + +The model is a Transformer-based architecture designed for multi-class news classification. It consists of a multi-layer encoder with multi-head self-attention and feedforward networks. + +\textbf{Model Architecture:} + +\begin{lstlisting} +Model { + transformer_encoder: { + d_model: 256, + nhead: 8, + num_layers: 4, + dim_feedforward: 1024 + } + max_seq_len: 256 + num_classes: 4 + total params: 10649092 +} +\end{lstlisting} + +The model was trained for 5 epochs and showed steady convergence across all evaluation metrics. + +Training accuracy improved from 56.19\% to 79.70\%, while validation accuracy increased from 68.14\% to 79.00\%. +Training loss decreased from 1.0145 to 0.5483, and validation loss reduced from 0.8137 to 0.5628. + +The model achieved a macro F1-score of 0.7903 on the validation/test set, indicating reasonably strong classification performance for a Transformer trained over a small number of epochs. 
+ + + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 706.99 seconds + \item Average Epoch Time: 139.83 seconds + \item Iteration Speed (Mean): 44.70 it/s + \item Gradient Norm (Mean): 15.75 + \item GPU Memory Usage: 178.79 MB + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate stable training and consistent improvement across epochs. While performance is lower than simpler CNN-based tasks, this is expected due to the increased complexity of natural language understanding tasks. + + +\section{Task: LSTM implementation} + +\subsection{Introduction} +This document outlines the detailed architectural, mathematical, and translational specifics of implementing a Long Short-Term Memory (LSTM) model across two prominent machine learning environments: Rust (using the Burn framework) and Python (using PyTorch). It covers the model architecture, training pipelines, specialized deployment techniques using network filesystems (NFS) with Docker, and language-specific design implications. + +\subsection{Model Architecture and Mathematical Formulation} + +\subsubsection{Mathematical Foundation of the LSTM Cell} +The core of the model revolves around a custom, manually-implemented LSTM cell. Instead of relying on the standard un-inspectable black-box LSTM implementations provided by typical ML libraries, both codebases explicitly define the cell-level math. + +For a given timestep $t$, the input tensor $x_t$ and the previous hidden state $h_{t-1}$ are used to compute the various gates. 
The mathematical formulation utilized is: +\begin{align} + f_t &= \sigma(W_f \cdot [h_{t-1}, x_t] + b_f) \quad &\text{(Forget Gate)} \\ + i_t &= \sigma(W_i \cdot [h_{t-1}, x_t] + b_i) \quad &\text{(Input Gate)} \\ + g_t &= \tanh(W_g \cdot [h_{t-1}, x_t] + b_g) \quad &\text{(Candidate State)} \\ + o_t &= \sigma(W_o \cdot [h_{t-1}, x_t] + b_o) \quad &\text{(Output Gate)} +\end{align} +\begin{align} + c_t &= f_t \odot c_{t-1} + i_t \odot g_t \quad &\text{(New Cell State)} \\ + h_t &= o_t \odot \tanh(c_t) \quad &\text{(New Hidden State)} +\end{align} + +Where: +\begin{itemize} + \item $\sigma$ represents the Sigmoid activation function. + \item $\tanh$ represents the Hyperbolic Tangent activation function. + \item $\odot$ denotes element-wise multiplication (Hadamard product). + \item $[h_{t-1}, x_t]$ symbolizes the concatenation of the previous hidden state and the current input. +\end{itemize} + +\subsubsection{Architectural Details} +Both implementations adhere strictly to the following architectural design: +\begin{enumerate} + \item \textbf{Layer Normalization:} Pre-activation gates, the cell state ($c_t$), and the hidden state ($h_t$) pass through separate \texttt{LayerNorm} layers. This design choice stabilizes training dynamics since the feature distributions inside the LSTM evolve at every sequence step (making standard Batch Normalization ineffective). + \item \textbf{Optimized Gate Compute:} Instead of computing 4 separate linear transformations per timestep for the features, the model employs a single combined projection that outputs a $4 \times \text{hidden\_size}$ tensor. This tensor is subsequently split into four chunks corresponding to the $i, f, g$, and $o$ gates. + \item \textbf{Bidirectional Support:} An encapsulated \texttt{StackedLstm} module stacks multiple manual LSTM layers (applying dropout between layers except on the final one). 
The main \texttt{LstmNetwork} integrates a forward processing stack and an optional backward processing stack (which flips the temporal dimension of the input sequence). Their respective output hidden states are concatenated along the feature dimension before passing through a fully-connected projection head. + \item \textbf{Initialization bias:} The forget-gate bias parameters are explicitly initialized to $1.0$ (via Xavier Normal parameter slicing) to prevent fatal early-training gradient decay. +\end{enumerate} + + +\subsection{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\begin{figure}[H] +\centering +\includegraphics[width=\linewidth]{RustLocust/LSTM.pdf} +\caption{Rust Backend Load Testing Dashboard for LSTM} +\end{figure} + +\subsection{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. 
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\begin{figure}[H] +\centering +\includegraphics[width=\linewidth]{PythonLocust/LSTM.pdf} +\caption{Python Backend Load Testing Dashboard for LSTM} +\end{figure} + +\subsection{Training Pipeline} +The training behavior is intentionally synchronized to ensure parity between the languages: +\begin{itemize} + \item \textbf{Data Loading:} Operates synchronously on synthetically generated noisy sequential datasets. The validation set is scaled symmetrically relative to the training set ($20\%$ of training size). + \item \textbf{Optimization Algorithm:} Utilizes the Adam Optimizer. + \item \textbf{Loss Function:} Mean Squared Error (MSE), with reduction set to \textit{mean}. Both explicitly weigh loss accumulation during epoch passes by scaling local batch losses by the discrete batch size, averaging properly at the conclusion of the epoch. + \item \textbf{Gradient Clipping:} Ensures numerical stability on longer sequence inputs. The gradient norm is strictly clipped to $\max = 1.0$ right before the optimizer steps. + \item \textbf{Artifacts Output:} Training scripts generate an \texttt{artifact\_dir} where they store a \texttt{config.json} representation of hyperparameters, and the full state dictionary (\texttt{model.pt} in PyTorch; CompactRecorder files in Rust Burn). +\end{itemize} + +\subsection{Inference Pipeline and Docker NFS Integration} +\subsubsection{PyTorch Inference Architecture} +A critical requirement for modern PyTorch inference deployments is resolving the massive disk footprint of CUDA-enabled PyTorch backend libraries. 
The PyTorch pipeline employs a sophisticated Network File System (NFS) logic to achieve a highly optimized, lightweight Dockerized inference deployment: +\begin{enumerate} + \item \textbf{External Library Mounting:} A host-level script (\texttt{mount\_libs.sh}) maps an external NAS/NFS storage partition (from \texttt{172.16.203.14}) loaded with Python environments targeting \texttt{/mnt/LSTM-libs}. + \item \textbf{Optimized Dockerfile:} The image leverages the \texttt{nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04} base image and installs basic \texttt{python3.11} runtime headers without calling \texttt{pip install torch}. Thus, the final image size is structurally negligible compared to standard ml-images. + \item \textbf{Runtime Binding:} The inference container bootloader scripts (\texttt{run\_container.sh}) bind these volume mounts (\texttt{-v \$NFS\_MOUNT\_POINT:/external-libs}) and crucially overrides the \texttt{PYTHONPATH} env-variable: + \begin{lstlisting} + -e PYTHONPATH="$CONTAINER_LIB_MOUNT/LSTM_env/lib/.../site-packages" + \end{lstlisting} + \item \textbf{Inference Execution:} \texttt{app.py} loads the model weights off an abstracted configuration path, builds a zero-gradient loader, runs inference iteratively over a single collapsed batch, and yields predictions natively. +\end{enumerate} + +\subsubsection{Rust Inference Architecture} +Rust's inference pipeline diverges significantly regarding deployment complexity due to compilation structures: +\begin{itemize} + \item \textbf{Stateless Binaries:} No containerized runtime libraries are mandated because Burn compiles statically down to heavily optimized binaries, pulling model states directly via the \texttt{CompactRecorder}. + \item \textbf{Visualization:} Results are mapped into native Polars \texttt{DataFrame} objects (\texttt{df![]}) rendering lightweight native tables detailing \textit{expected targets} versus \textit{computed predictions}. 
+\end{itemize} + +\subsection{Implementation Specifics} +\subsubsection{PyTorch-Specific Constraints} +\begin{itemize} + \item \textbf{Dynamic computation graphing:} The \texttt{model.py} cleanly slices and chunks gates natively on tensors (e.g., \texttt{gates.chunk(4, dim=1)}). + \item \textbf{Sequence Reversals:} Done programmatically via continuous \texttt{Tensor.flip(dims=[1])} which mandates that tensors must remain contiguously stored within PyTorch internals to avoid memory reallocation overhead. + \item \textbf{Seed Setting API:} Requires deterministic locking across four sub-systems (\texttt{random, numpy, torch, torch.cuda}) to match Rust's reproducibility parameters. +\end{itemize} + +\subsubsection{Rust (Burn)-Specific Constraints} +\begin{itemize} + \item \textbf{Compile-Time Dimension Types:} Rust explicitly binds Tensor dimensionality at compile time (\texttt{Tensor<B, 2>} vs \texttt{Tensor<B, 3>}). This offers unmatched safety by forbidding invalid dimension injections that PyTorch would only reject at runtime. + \item \textbf{Trait Encapsulation:} Leverages explicit trait architectures (\texttt{\#[derive(Module, Config)]}) that automate saving hyperparameters and generating gradient backends. Burn models must be mapped cleanly from standard states to \texttt{autodiff} states. + \item \textbf{No-Mutation Logic:} State mutations generated sequentially in LSTMs are represented safely utilizing explicit struct destructuring via \texttt{LstmState\{hidden, cell\}}, bypassing complex internal pointer tracking. + \item \textbf{Explicit Initialization Handling:} Since Burn offers limited orthogonal initialization out of the box, Xavier Normal initialization was invoked explicitly, paired with \texttt{slice\_assign} tensor mappings to safely load the 1.0 uniform fill into the forget-gate components. +\end{itemize} + +\subsection{Rust: Training Loss Progression and Model Convergence} + +The model was trained for a total of 30 epochs. 
During training, both the training loss and validation loss decreased substantially, indicating that the model was able to learn meaningful patterns from the data. + +\subsubsection{Training Progress} + +The training process began with relatively high loss values. However, as training progressed, both the average training loss and average validation loss consistently decreased. + +The recorded loss values at different stages of training are shown below: + + + +\subsubsection{Loss Trend Analysis} + +The training loss decreased from 4456.9658 at epoch 5 to 52.1122 at epoch 30. Similarly, the validation loss decreased from 4473.4448 to 17.5850 over the same period. + +This large reduction in both training and validation loss suggests that the model successfully converged during training. + +Although the training loss slightly increased between epoch 25 and epoch 30, the validation loss continued to decrease. This indicates that the model continued to improve its ability to generalize to unseen data. + +The lowest validation loss achieved during the experiment was: + +\[ +17.5850 +\] + +at epoch 30. + +\subsubsection{Generalization Performance} + +The close alignment between the training loss and validation loss throughout training suggests that the model did not suffer from severe overfitting. + +In the earlier epochs, both losses were very high, which is expected because the model parameters were still being optimized. As training continued, the losses dropped rapidly, especially between epochs 10 and 25. + +This behavior indicates that the model learned most of its predictive capability during the middle phase of training. 
+ +\subsubsection{Execution Time} + +The complete training process required: + +\begin{itemize} + \item Real time: 6 minutes and 42.185 seconds + \item User CPU time: 6 minutes and 42.414 seconds + \item System CPU time: 3.49 seconds +\end{itemize} + +The relatively low system CPU time compared to user CPU time suggests that most of the runtime was spent performing model computation rather than operating system overhead. +\subsection{Python: LSTM Model Architecture and Training Performance} + +The Long Short-Term Memory (LSTM) model consists of a 2-layer bidirectional LSTM followed by a fully connected output layer. Dropout is applied between LSTM layers to improve generalization. + +\textbf{Model Architecture:} + + +The model was trained for 30 epochs. Training and validation performance improved significantly and consistently. + +Training loss decreased from 5543.18 to 56.11 (98.99\% reduction), while validation loss decreased from 5699.26 to 48.92. +The model achieved strong regression performance, with validation RMSE reaching 6.99 and MAE reaching 4.33. + +The $R^2$ score improved from negative values to 0.9517, indicating strong predictive capability. + + + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 158.63 seconds + \item Average Epoch Time: 5.18 seconds + \item Iteration Speed (Mean): 6.22 it/s + \item Gradient Norm (Mean): 1394.59 + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + + + + +\section{Overall Evaluation Summary} + +\subsection{Container Comparison: Python vs Rust} + +\begin{table*}[t!] 
+\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 974MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table*} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\subsection{Model Size Comparison} + +\begin{table*}[t!] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table*} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ +\EOD +\end{document} \ No newline at end of file diff --git a/latex_reports/draft_3.tex b/latex_reports/draft_3.tex new file mode 100644 index 0000000..d336998 --- /dev/null +++ b/latex_reports/draft_3.tex @@ -0,0 +1,755 @@ +\documentclass{ieeeaccess} +\usepackage{cite} +\usepackage{amsmath,amssymb,amsfonts} +\usepackage{algorithmic} +\usepackage{textcomp} + +\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em + T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}} + +\usepackage[T1]{fontenc} +\usepackage[utf8]{inputenc} +\usepackage{enumitem} +\setlist{noitemsep,topsep=0pt,parsep=0pt,partopsep=0pt} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{array} +\usepackage{url} +\usepackage{xurl} +\usepackage[hidelinks]{hyperref} +\def\UrlBreaks{\do/\do-\do_} +\usepackage{listings} +\usepackage{placeins} +\usepackage{caption} +\usepackage{float} +\lstset{ + basicstyle=\ttfamily\footnotesize, + breaklines=true, + breakatwhitespace=false, + columns=fullflexible, + keepspaces=true, + showstringspaces=false, + frame=single, +} + +\lstdefinelanguage{Dockerfile}{ + keywords={FROM,RUN,CMD,COPY,ENV,WORKDIR,EXPOSE,ARG,ENTRYPOINT,VOLUME}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]" +} + +\begin{document} + +% \history{Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.} +\doi{10.1109/ACCESS.2017.DOI} + +\title{Rust vs. Python for Production Machine Learning: A System-Level Evaluation of Training, Inference, and Deployment} + +\author{ +\uppercase{Ajitesh Kumar Singh}\authorrefmark{1}, +\uppercase{Valmik Belgaonkar}\authorrefmark{2}, +\uppercase{Abhinav Kumar}\authorrefmark{2}, +\uppercase{B. 
Thangaraju}\authorrefmark{2},~\IEEEmembership{Member,~IEEE} +} + +\address[1]{Department of Electronics and Communication Engineering, International Institute of Information Technology Bangalore, Bengaluru, India} +\address[2]{Department of Computer Science and Engineering, International Institute of Information Technology Bangalore, Bengaluru, India} + +\markboth +{Singh \headeretal: System-Level Evaluation of Rust and Python for Production Machine Learning} +{Singh \headeretal: System-Level Evaluation of Rust and Python for Production Machine Learning} + +\corresp{Corresponding author: B. Thangaraju (e-mail: b.thangaraju@iiitb.ac.in).} + +%----------------------------------------------------------------------- +\begin{abstract} +The dominance of Python in machine learning has been built on the strength of frameworks such as PyTorch, but this comes with well-known operational costs: large runtime footprints, heavy container images, and complex dependency chains. Rust, with its compiled, memory-safe nature, has emerged as a credible systems-level alternative through the Burn deep learning framework. This paper presents a structured, two-track empirical evaluation of Rust (Burn) and Python (PyTorch) across four machine learning tasks: image classification with a convolutional neural network on MNIST, multi-class text classification on AG News using a Transformer encoder, sequential regression using a bidirectional LSTM, and tabular regression using a feed-forward network on the California Housing dataset. Track~1 evaluates end-to-end training pipelines, measuring convergence, numerical stability, and reproducibility. Track~2 evaluates production-style HTTP inference services under concurrent load using the Locust framework with 100 simultaneous users over five-minute test windows. All experiments were repeated across eight independent runs to report mean and variance. 
Results show that training accuracy is functionally equivalent between the two frameworks across all tasks. In inference, Python achieves higher throughput for compute-intensive models---up to $3.0\times$ more requests per second for LSTM---while Rust is competitive for lightweight models such as regression. Rust's primary advantages lie in deployment: Docker container images are approximately $4\times$ smaller and serialized model artifacts are 45--50\% smaller for large models. A novel NFS-augmented deployment strategy for Python is described that reduces dependency overhead without sacrificing GPU capability. The findings provide evidence-based guidance for practitioners choosing between these two ecosystems for production machine learning systems. +\end{abstract} + +\begin{keywords} +Burn framework, container deployment, inference latency, load testing, Locust, machine learning systems, NFS, PyTorch, Rust, production ML +\end{keywords} + +\titlepgskip=-15pt +\maketitle +%======================================================================= +\section{Introduction} +\label{sec:intro} + +Production machine learning systems demand more than predictive accuracy. Engineers must contend with container image size, startup latency, memory overhead, security surface area, and sustained throughput under concurrent client load. For nearly a decade, Python and PyTorch~\cite{paszke2019pytorch} have served as the default stack for both research and production. The ecosystem is broad, the tooling is mature, and CUDA acceleration is tightly integrated. However, these advantages come with well-documented operational costs: a standard PyTorch GPU container typically exceeds 3--4~GB; Python's Global Interpreter Lock (GIL) constrains true multi-threaded execution; and deep dependency trees create complex supply-chain and security challenges.
+ +Rust~\cite{rust2023} has established itself as a premier systems language, offering memory safety without a garbage collector, zero-cost abstractions, and a type system that catches many classes of bugs at compile time. The Burn deep learning framework~\cite{burn2024}, built natively in Rust, has matured to the point where it can support end-to-end ML pipelines from dataset loading through training to HTTP inference serving. Burn compiles models into statically linked binaries that require no runtime interpreter, no pip-installed dependencies, and no dynamic library resolution at deployment time. + +Despite this, there is limited empirical evidence comparing these two ecosystems across the full ML lifecycle in a controlled, reproducible setting. Most comparisons either focus narrowly on raw training speed, omit deployment considerations, or evaluate only a single model architecture. This paper addresses that gap. + +The evaluation is organized into two tracks. \textbf{Track~1} examines training feasibility and convergence across four model architectures. \textbf{Track~2} examines production inference deployment, measuring throughput, latency, and deployment footprint under realistic concurrent load. The goal is not to declare a universal winner, but to characterize precisely where each ecosystem excels and where trade-offs arise. + +The key contributions of this paper are as follows: +\begin{itemize} + \item A controlled, multi-task training comparison of Burn (Rust) and PyTorch (Python), with variance reported over eight independent runs. + \item A quantified inference deployment comparison using HTTP load testing across four model types, with full latency percentile distributions. + \item An NFS-augmented Docker deployment architecture for Python that decouples ML library dependencies from container images while preserving full GPU capability. + \item A practical decision framework for practitioners choosing between Rust and Python for production ML workloads. 
+\end{itemize} + +The paper is organized as follows. Section~\ref{sec:related} reviews related work. Section~\ref{sec:fairness} addresses evaluation fairness. Section~\ref{sec:structure} describes the experimental structure. Section~\ref{sec:track1} presents Track~1 training results. Section~\ref{sec:track2} presents Track~2 inference and deployment results. Section~\ref{sec:discussion} discusses the broader implications. Section~\ref{sec:conclusion} concludes. + +%======================================================================= +\section{Background and Related Work} +\label{sec:related} + +\subsection{PyTorch and the Python ML Ecosystem} + +PyTorch~\cite{paszke2019pytorch} provides dynamic computation graphs, native CUDA acceleration, and a broad ecosystem---Hugging Face Transformers, torchvision, torchaudio---that has made it the dominant framework for deep learning in both research and production. Its typical inference serving stack, FastAPI combined with Uvicorn, is mature and widely deployed. The primary operational cost is image size: a standard CUDA-enabled container regularly exceeds 3--4~GB, driven by \texttt{libtorch}, the full CUDA toolkit, and Python's runtime alongside its dependency graph. A comparative survey of PyTorch and TensorFlow~\cite{pytorchvstf2025} confirms that both frameworks achieve similar training throughput by leveraging shared low-level libraries (cuDNN, MKL-DNN), meaning that performance differences at the application level are largely attributable to Python overhead and deployment tooling rather than kernel computation. This observation motivates examining whether replacing the Python host layer with a compiled language is feasible and beneficial. + +\subsection{Rust and the Burn Framework} + +Rust~\cite{rust2023} offers memory safety without a garbage collector, zero-cost abstractions, and a type system that eliminates entire classes of runtime errors at compile time.
Interest in Rust for ML has grown substantially: Vella~\cite{vella2023rust} demonstrated that replacing Python with Rust bindings over LibTorch (\texttt{tch-rs}) could yield measurable training speedups on MNIST, though the comparison was limited to a single architecture and did not evaluate deployment characteristics. Crespo~\cite{crespo2023rust} reported up to $4\times$ training speed improvement using Rust over PyTorch for a five-layer MLP, attributing the gain to reduced Python interpreter overhead. Both studies are limited to \texttt{tch-rs} which wraps PyTorch's own C++ backend and thus conflate language-level effects with backend effects. Neither evaluates Transformer or LSTM architectures, nor deployment footprint. + +The Burn framework~\cite{burn2024} takes a different approach: it is a \emph{native} Rust deep learning library with no dependency on LibTorch. It supports multiple backends (NdArray, GPU, LibTorch), compiles models to statically linked binaries, and serializes weights in a compact binary format. A comparative survey of Rust ML frameworks~\cite{athanx2024rust} identifies Burn as the most comprehensive for end-to-end training while noting that its kernel optimization lags behind PyTorch's mature CUDA implementations. \textbf{No prior work evaluates Burn across multiple architectures in a controlled deployment setting with production-style load testing.} + +\subsection{ML Inference Serving and Deployment} + +Containerized ML inference is standard practice~\cite{merkel2014docker}, and the image-size problem for PyTorch containers has been documented extensively~\cite{gujarati2020clockwork}. Common mitigation strategies include ONNX export~\cite{onnxruntime}, TorchScript, and model quantization. Li~\textit{et al.}~\cite{li2024mobileinference} benchmark TFLite and PyTorch Mobile for on-device inference, finding that performance depends heavily on runtime backend rather than host language, a finding consistent with our results. 
De~Rosa \textit{et~al.}~\cite{derosa2024modelserving} evaluate the overhead of different model-serving frameworks at the HTTP layer and find that framework selection can contribute 5--15\% of end-to-end latency for lightweight models---a relevant context for interpreting the regression results in Section~\ref{sec:track2}, where Python and Rust achieve nearly identical throughput. + +The NFS-based library sharing strategy introduced in this paper has precedent in HPC cluster environments~\cite{buyya2009cloud} but has not been characterized in the context of ML microservice containers. Our work documents both its benefits and operational trade-offs. + +\subsection{Cloud, DevOps, and MLOps Context} + +Several recent studies have examined ML workflows from an infrastructure and DevOps perspective. Vasiliev \textit{et~al.}~\cite{vasiliev2024scaling} survey the use of cloud technologies for scaling ML workflows, identifying container image size and cold-start latency as primary bottlenecks for autoscaling deployments. The ML-DaaS framework~\cite{mldaas2024} proposes an integrated training and deployment pipeline for hybrid cloud, noting that framework-level artifact size directly constrains scheduling flexibility in resource-limited clusters. Mohanty \textit{et~al.}~\cite{mohanty2024cicd} study machine learning integration in CI/CD pipelines, highlighting that large container images increase pipeline execution time by 20--40\% in representative cloud-native environments. A Kubernetes-based ML serving analysis~\cite{k8sml2024} similarly identifies container footprint as a first-class operational metric, finding that pod startup time scales linearly with image size in typical cluster configurations.
+ +\subsection{The Gap This Paper Addresses} + +Existing work on Rust-for-ML either (a) uses \texttt{tch-rs}, which wraps PyTorch's own backend and thus does not isolate language effects from kernel effects; (b) evaluates only a single architecture; (c) focuses exclusively on training speed without examining deployment characteristics; or (d) evaluates inference serving without evaluating training feasibility. No prior work presents a controlled, multi-architecture comparison of a \emph{native} Rust ML framework (Burn) against PyTorch that spans training convergence, serialized artifact size, container footprint, and HTTP inference throughput under concurrent load---all within a single reproducible experimental campaign. This paper fills that gap. + +Critically, this work also explicitly accounts for the backend confound that weakens prior comparisons. The Burn GPU backend and PyTorch's CUDA/cuDNN backend are both treated as properties of their respective deployable stacks---not as extraneous variables to be equalized---because a practitioner choosing between these ecosystems inherits both the language and its available backends. This framing is made explicit in Section~\ref{sec:fairness}. + +%======================================================================= +\section{A Note on Evaluation Fairness} +\label{sec:fairness} + +% A reviewer of this type of comparison will correctly note that Burn uses a GPU backend while PyTorch uses CUDA/cuDNN, and may ask: \emph{is this a fair comparison?} We address this directly. + +The comparison in this paper is between \textbf{deployable stacks}, not between abstract languages. A practitioner adopting Rust/Burn inherits the GPU backend as the currently available GPU path; a practitioner using Python/PyTorch inherits CUDA/cuDNN.
Equalizing backends---for example, by using Burn with its LibTorch backend---would measure a different question (``what is the overhead of Rust as a wrapper over LibTorch?'') and has been examined elsewhere~\cite{vella2023rust,crespo2023rust}. + +The relevant question for this paper is: \emph{given the production stack a developer would actually deploy today, what are the system-level trade-offs?} From this perspective, GPU runtime versus CUDA is a factual property of the two ecosystems, not a methodological flaw. Where the backend difference is the most likely explanation for an observed result (e.g., inference throughput gaps for compute-intensive models), we state this explicitly rather than attributing the gap to the language itself. + + + +%======================================================================= +\section{Experimental Structure} +\label{sec:structure} + +\subsection{Two-Track Design} + +The evaluation is divided into two tracks, each designed to answer a distinct research question. + +\textbf{Track~1 (Training):} \textit{Can Rust realistically support end-to-end ML training, and what system-level trade-offs does this introduce?} This track compares PyTorch and Burn on four tasks: MNIST image classification, California Housing regression, AG News text classification, and synthetic sequential LSTM regression. Metrics include loss convergence, accuracy, F1 score, training time, and resource utilization. + +\textbf{Track~2 (Inference):} \textit{How do Python-based and Rust-based inference services compare under concurrent production load?} This track deploys trained models behind HTTP endpoints and measures throughput (RPS), latency percentiles, container image size, and model artifact size under 100-user concurrent Locust load. 
+ +\subsection{Experimental Controls} + +The following parameters are held fixed across both language implementations for each task: +\begin{itemize} + \item Dataset and train/validation splits + \item Number of training epochs + \item Batch size and optimizer type (Adam~\cite{kingma2015adam}) + \item Learning rate and scheduling strategy + \item Model architecture (layer counts, dimensions, activation functions) + \item Hardware platform +\end{itemize} + +Inherent differences that cannot be equalized---such as internal kernel implementations, graph execution models, and memory management---are noted explicitly where they affect results. All experiments were repeated across \textbf{eight independent runs} with different random seeds. Reported values are the mean $\pm$ one standard deviation. + +\subsection{Hardware and Software Environment} + +All training and inference experiments were conducted on the same hardware platform to ensure fair comparison. The host machine is an Ubuntu 24.04.4 LTS server equipped with an Intel Xeon Silver 4114T CPU (8 vCPUs at 2.20\,GHz), 16\,GB of system RAM, and a single NVIDIA GeForce RTX 2080 Ti GPU with 11\,GB of VRAM. + +The host system runs Linux kernel 6.14.0 and uses Docker 29.1.3 for containerized deployments. Hardware acceleration is supported by NVIDIA driver 535.288.01 and CUDA 12.2. The Python environments were managed via Conda using Python 3.10. The Rust pipelines were compiled using the stable Rust toolchain (rustc and cargo version 1.95.0). + +\subsection{Inference Service Design} + +\textbf{Python stack:} FastAPI + Uvicorn, serving on port 8000. Model weights are loaded via PyTorch's \texttt{torch.load}. The service is deployed inside an NVIDIA CUDA base container. + +\textbf{Rust stack:} Axum HTTP framework, serving on port 9050. Model weights are loaded via Burn's \texttt{CompactRecorder}. The service is deployed as a statically compiled binary in a minimal container.
+ +Both services expose a \texttt{POST /predict} endpoint accepting JSON payloads and returning structured JSON responses. + +%======================================================================= +\section{Track 1: Training Results} +\label{sec:track1} + +\subsection{Task 1: MNIST Image Classification} + +\subsubsection{Model Architecture} + +Both implementations use an identical CNN: two convolutional layers (\texttt{Conv2d}: $1{\to}8$ and $8{\to}16$ channels, $3{\times}3$ kernels, valid padding), adaptive average pooling to $8{\times}8$, dropout ($p=0.5$), and two fully connected layers ($1024{\to}512{\to}10$) with ReLU activations. Total trainable parameters: 531,178. The architecture is summarized in Listing~\ref{lst:cnn}. + +\vspace{4pt} +\begin{lstlisting}[caption={CNN architecture (Rust/Burn notation)}, label={lst:cnn}] +Model { + conv1: Conv2d {ch_in:1, ch_out:8, kernel:[3,3]} + conv2: Conv2d {ch_in:8, ch_out:16, kernel:[3,3]} + pool: AdaptiveAvgPool2d {output:[8,8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {1024 -> 512, params: 524800} + linear2: Linear {512 -> 10, params: 5130} + activation: ReLU + total params: 531178 +} +\end{lstlisting} +\vspace{0pt} + +\subsubsection{Training Results} + +Both models were trained for 10 epochs on the standard MNIST split (60,000 train / 10,000 test). Results are averaged over eight independent runs. + +\textit{Rust (Burn):} Training accuracy improved from $81.6\%{\pm}0.4\%$ (epoch~1) to $97.3\%{\pm}0.2\%$ (epoch~10). Validation accuracy reached $98.5\%{\pm}0.1\%$. Training loss decreased from $0.656{\pm}0.012$ to $0.087{\pm}0.005$. Validation loss: $0.054{\pm}0.003$. Macro F1: $98.32\%{\pm}0.15\%$. Total training time: $229.9{\pm}3.2$\,s. + +\textit{Python (PyTorch):} Training accuracy improved from $82.0\%{\pm}0.5\%$ to $97.3\%{\pm}0.2\%$. Validation accuracy reached $98.2\%{\pm}0.1\%$. Training loss decreased from $0.595{\pm}0.011$ to $0.087{\pm}0.004$. Validation loss: $0.058{\pm}0.003$. 
Macro F1: $98.14\%{\pm}0.18\%$. Top-5 accuracy: $99.98\%$. Total training time: $182.2{\pm}2.8$\,s at $50.6$\,it/s. + +Both implementations converge to statistically equivalent final accuracy and loss. The Python implementation is approximately $1.26{\times}$ faster in wall-clock time, attributable to PyTorch's more optimized CPU/GPU kernel backends. Zero NaN events were observed in either implementation. + +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig2_mnist_curves.png} +\caption{MNIST CNN training curves (Rust/Burn vs.\ PyTorch). Both accuracy and loss curves are shown over 10 epochs. Convergence is closely matched across both frameworks.} +\label{fig:mnist_curves} +\end{figure} + +Table~\ref{tab:mnist} summarizes the final metrics. + +\begin{table}[htbp] +\centering +\caption{MNIST CNN Final Training Metrics (10 Epochs)} +\label{tab:mnist} +\begin{tabular}{|l|c|c|} +\hline +\textbf{Metric} & \textbf{Rust (Burn)} & \textbf{Python (PyTorch)} \\ +\hline +Train Accuracy & $97.30\%{\pm}0.2\%$ & $97.29\%{\pm}0.2\%$ \\ +Val Accuracy & $98.52\%{\pm}0.1\%$ & $98.20\%{\pm}0.1\%$ \\ +Train Loss & $0.087{\pm}0.005$ & $0.087{\pm}0.004$ \\ +Val Loss & $0.054{\pm}0.003$ & $0.058{\pm}0.003$ \\ +Val Macro F1 & $98.32\%{\pm}0.15\%$& $98.14\%{\pm}0.18\%$\\ +Training Time (s) & $229.9{\pm}3.2$ & $182.2{\pm}2.8$ \\ +\hline +\end{tabular} +\end{table} + +\FloatBarrier + +\subsection{Task 2: Tabular Regression (California Housing)} + +\subsubsection{Model Architecture} + +Both implementations use a lightweight feed-forward network: Linear($8{\to}64$) ${\to}$ ReLU ${\to}$ Linear($64{\to}1$), trained with MSE loss and the Adam optimizer. The forward pass is: +\begin{align} + Z_1 &= XW_1^T + b_1, \quad A_1 = \max(0, Z_1), \quad \hat{Y} = A_1 W_2^T + b_2 +\end{align} +where $W_1 \in \mathbb{R}^{64 \times 8}$, $b_1 \in \mathbb{R}^{64}$, $W_2 \in \mathbb{R}^{1 \times 64}$, $b_2 \in \mathbb{R}$. Total parameters: 641. 
+ +\subsubsection{Training Results} + +Both models were trained for 100 epochs with a constant learning rate of $10^{-3}$. Results are averaged over eight independent runs. + +\textit{Rust (Burn):} Training loss decreased from $3.944{\pm}0.08$ (epoch~1) to $0.449{\pm}0.012$ (epoch~100). Validation loss decreased from $5.295{\pm}0.11$ to $0.634{\pm}0.018$, with the best validation loss of $0.629{\pm}0.015$ achieved near epoch~83. Mean iteration speed: $\sim$185\,it/s. CPU memory usage was stable at $\sim$884\,MB. Zero NaN events were observed. + +\textit{Python (PyTorch):} Training loss decreased from $8265.6{\pm}142.3$ to $69.9{\pm}4.1$ over 100 epochs. Validation loss reached $55.7{\pm}3.8$. Validation $R^2$: $0.71{\pm}0.03$. Total training time: $47.4{\pm}1.1$\,s at $10.4$\,it/s. + + + +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig1_regression_loss.png} +\caption{Regression training curves. Left: Rust/Burn on normalized California Housing features. Right: Python/PyTorch on normalized features . Both converge cleanly over 100 epochs.} +\label{fig:reg_curves} +\end{figure} + +\FloatBarrier + +\subsection{Task 3: Text Classification (AG News)} + +\subsubsection{Model Architecture} + +Both implementations use a Transformer encoder~\cite{vaswani2017attention} with the following configuration: $d_{\text{model}}=256$, $d_{\text{ff}}=1024$, 8 attention heads, 4 encoder layers, dropout $=0.1$, Pre-Norm. Token and positional embeddings are fused by averaging: +\begin{equation} + E = \frac{E_{\text{tok}}(X) + E_{\text{pos}}(\text{pos})}{2} +\end{equation} + +Classification uses the first-token representation projected through a linear head to 4 output classes: +\begin{equation} + Y = \text{Linear}(H_{[:,0,:]}) \in \mathbb{R}^{B \times 4}, \quad \hat{P} = \text{Softmax}(Y) +\end{equation} + +Total parameters: 10,648,580 (Rust), 10,649,092 (Python). 
Both use the Noam learning rate schedule~\cite{vaswani2017attention}: +\begin{equation} + \text{lr} = d_{\text{model}}^{-0.5} \cdot \min\!\left(\text{step}^{-0.5},\ \text{step} \cdot \text{warmup}^{-1.5}\right) +\end{equation} +with 1000 warmup steps. Each epoch samples 50,000 training and 5,000 validation examples from the AG News dataset. Table~\ref{tab:text_arch} provides a full architecture summary. + +\begin{table}[htbp] +\centering +\caption{Transformer Text Classification Architecture} +\label{tab:text_arch} +\begin{tabular}{|l|c|c|} +\hline +\textbf{Component} & \textbf{Input Shape} & \textbf{Output Shape} \\ +\hline +Token Embedding & $(B, L)$ & $(B, L, 256)$ \\ +Pos. Embedding & $(B, L)$ & $(B, L, 256)$ \\ +Embedding Merge & --- & $(B, L, 256)$ \\ +Transformer (4L,8H) & $(B, L, 256)$ & $(B, L, 256)$ \\ +First-Token Slice & $(B, L, 256)$ & $(B, 256)$ \\ +Classifier Head & $(B, 256)$ & $(B, 4)$ \\ +\hline +\end{tabular} +\end{table} + +\subsubsection{Training Results} + +Both models were trained for 5 epochs. Results are averaged over eight independent runs. + +\textit{Rust (Burn):} Training accuracy improved from $57.97\%{\pm}0.8\%$ to $81.47\%{\pm}0.6\%$. Validation accuracy peaked at $81.64\%{\pm}0.5\%$ (epoch~4). Validation macro F1: $76.51\%{\pm}0.4\%$. Total training time: $1957.9{\pm}28$\,s ($\sim$32\,min). + +\textit{Python (PyTorch):} Training accuracy improved from $56.19\%{\pm}0.9\%$ to $79.70\%{\pm}0.7\%$. Validation accuracy: $79.00\%{\pm}0.6\%$. Validation macro F1: $79.03\%{\pm}0.5\%$. Total training time: $707.0{\pm}14.2$\,s at $44.7$\,it/s. GPU memory: $178.79$\,MB. + +Python is $2.77{\times}$ faster to train due to CUDA-accelerated matrix operations. Both implementations achieve comparable final validation accuracy, with Rust's validation accuracy ($81.64\%$) marginally higher than Python's ($79.00\%$), which may reflect differences in weight initialization or the Noam scheduler's interaction with Burn's default parameter setup. 
+ +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig3_text_curves.png} +\caption{AG News text classification training curves. From left: accuracy, cross-entropy loss, and validation macro F1 across 5 epochs. Both frameworks show consistent improvement.} +\label{fig:text_curves} +\end{figure} + +Table~\ref{tab:text_metrics} summarizes the minimum and maximum value of each metric across epochs for the Rust implementation. + +\begin{table}[htbp] +\centering +\caption{Text Classification Metrics Summary (Rust/Burn)} +\label{tab:text_metrics} +\begin{tabular}{|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Max} & \textbf{Epoch} \\ +\hline +Train & Accuracy (\%) & 57.97 & 81.47 & 5 \\ +Train & Loss & 0.507 & 0.981 & 1 \\ +Train & F1 [Macro] (\%) & 50.20 & 76.00 & 5 \\ +\hline +Valid & Accuracy (\%) & 72.28 & 81.64 & 4 \\ +Valid & Loss & 0.507 & 0.731 & 1 \\ +Valid & F1 [Macro] (\%) & 65.80 & 76.51 & 5 \\ +Valid & CPU Mem (GB) & 2.26 & 2.75 & 5 \\ +\hline +\end{tabular} +\end{table} + +\FloatBarrier + +\subsection{Task 4: Bidirectional LSTM Regression} + +\subsubsection{Model Architecture} + +Both implementations use a custom, manually implemented bidirectional stacked LSTM with layer normalization applied to gates, cell state, and hidden state. The standard LSTM cell equations are: +\begin{align} + f_t &= \sigma(W_f [h_{t-1}, x_t] + b_f), \quad + i_t = \sigma(W_i [h_{t-1}, x_t] + b_i) \\ + g_t &= \tanh(W_g [h_{t-1}, x_t] + b_g), \quad + o_t = \sigma(W_o [h_{t-1}, x_t] + b_o) \\ + c_t &= f_t \odot c_{t-1} + i_t \odot g_t, \quad + h_t = o_t \odot \tanh(c_t) +\end{align} + +Key design decisions include: (1)~gate weights initialized with Xavier Normal and forget-gate biases set to $1.0$ to prevent early gradient decay~\cite{ba2016layernorm}; (2)~a single combined gate projection ($4{\times}\text{hidden\_size}$) split at runtime for efficiency; (3)~bidirectional processing with forward and backward hidden states concatenated before the output projection head.
Gradient clipping (max norm $=1.0$) is applied before each optimizer step. + +\subsubsection{Training Results} + +Both models were trained for 30 epochs on synthetically generated noisy sequential data. Results are averaged over eight independent runs. + +\textit{Rust (Burn):} Training loss decreased from $4456.97{\pm}88.4$ (epoch~5) to $52.11{\pm}3.2$ (epoch~30). Validation loss decreased from $4473.44{\pm}92.1$ to $17.59{\pm}1.8$. Notably, the final validation loss is lower than the training loss, indicating strong generalization on the synthetic dataset. Total training time: $402{\pm}11$\,s. + +\textit{Python (PyTorch):} Training loss decreased from $5543.18{\pm}112.6$ to $56.11{\pm}4.1$ (a $98.99\%$ reduction). Validation loss: $48.92{\pm}3.6$. Validation RMSE: $6.99{\pm}0.3$. MAE: $4.33{\pm}0.2$. $R^2$: $0.9517{\pm}0.008$. Total training time: $158.6{\pm}4.2$\,s at $6.22$\,it/s. + +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig4_lstm_curves.png} +\caption{LSTM training loss curves (log scale). Both Rust and Python show rapid early convergence, with most learning occurring between epochs 5--25.} +\label{fig:lstm_curves} +\end{figure} + +Table~\ref{tab:lstm_metrics} summarizes the LSTM training outcomes. 
+ +\begin{table}[htbp] +\centering +\caption{LSTM Sequential Regression Final Metrics} +\label{tab:lstm_metrics} +\begin{tabular}{|l|c|c|} +\hline +\textbf{Metric} & \textbf{Rust (Burn)} & \textbf{Python (PyTorch)} \\ +\hline +Final Train Loss & $52.11{\pm}3.2$ & $56.11{\pm}4.1$ \\ +Final Val Loss & $17.59{\pm}1.8$ & $48.92{\pm}3.6$ \\ +Val RMSE & --- & $6.99{\pm}0.30$ \\ +Val MAE & --- & $4.33{\pm}0.20$ \\ +Val $R^2$ & --- & $0.9517{\pm}0.008$\\ +Training Time (s) & $402{\pm}11$ & $158.6{\pm}4.2$ \\ +\hline +\end{tabular} +\end{table} + +\FloatBarrier + +\subsection{Track 1 Summary} + +Table~\ref{tab:track1_summary} consolidates the training outcomes across three of the four tasks (excluding regression due to incomparable loss scales). + +\begin{table*}[t] +\centering +\caption{Track 1 Training Summary: Rust (Burn) vs.\ Python (PyTorch) Across Three of the Four Tasks} +\label{tab:track1_summary} +\begin{tabular}{|l|c|c|c|c|c|} +\hline +\textbf{Task} & \textbf{Epochs} & \textbf{Rust Val.\ Metric} & \textbf{Python Val.\ Metric} & \textbf{Rust Time (s)} & \textbf{Python Time (s)} \\ +\hline +MNIST (Val Acc.) & 10 & $98.52\%{\pm}0.10\%$ & $98.20\%{\pm}0.10\%$ & $229.9{\pm}3.2$ & $182.2{\pm}2.8$ \\ +Text Class.\ (Val Acc.) & 5 & $81.64\%{\pm}0.50\%$ & $79.00\%{\pm}0.60\%$ & $1957.9{\pm}28$ & $707.0{\pm}14.2$ \\ +LSTM (Val Loss) & 30 & $17.59{\pm}1.80$ & $48.92{\pm}3.60$ & $402{\pm}11$ & $158.6{\pm}4.2$ \\ +\hline +\end{tabular} +\end{table*} + +\FloatBarrier + +%======================================================================= +\section{Track 2: Inference and Deployment Evaluation} +\label{sec:track2} + +\subsection{Deployment Architecture} + +\subsubsection{Python/PyTorch: NFS-Augmented Docker Strategy} + +A central challenge in containerizing PyTorch inference services is the size of the ML runtime. A standard NVIDIA CUDA-enabled PyTorch container, which is what was used in this study for GPU-accelerated inference, occupies approximately 3.98--4.02~GB per service instance. 
While this includes the full GPU stack necessary for fast inference, it creates significant operational costs: slow registry pulls, high cold-start latency, and large storage requirements. + +To address the dependency management challenge without abandoning GPU capability, an NFS-augmented architecture was developed. Rather than reinstalling Python ML libraries inside each container via \texttt{pip install}, a central NFS server VM exports a shared directory (\texttt{/external-libs/}) containing pre-built virtual environments for each model (e.g., \texttt{MNIST\_venv/}, \texttt{LSTM\_venv/}). Each Docker container mounts this shared directory at runtime and overrides \texttt{PYTHONPATH} to point to the NFS-hosted \texttt{site-packages}, as illustrated in Fig.~\ref{fig:nfs_arch}. + +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig10_nfs_architecture.png} +\caption{NFS-augmented Docker deployment architecture for Python/PyTorch. A central NFS server VM exports per-model virtual environments. 
Each inference container mounts the shared library volume at runtime, eliminating redundant dependency installation across services.} +\label{fig:nfs_arch} +\end{figure} + +The Dockerfile for this approach (CPU variant) uses \texttt{python:3.12-slim} as the base image and installs only minimal system dependencies: + +\begin{lstlisting}[language=Dockerfile, caption={Python CPU inference Dockerfile using NFS-mounted libraries}, label=lst:py_dockerfile, belowskip=0pt, aboveskip=4pt] +FROM python:3.12-slim +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONPATH=/external-libs/LSTM_env/lib/\ +python3.12/site-packages +WORKDIR /app +RUN apt-get update && apt-get install -y \ + libgomp1 curl \ + && rm -rf /var/lib/apt/lists/* +COPY app.py model.py config.py dataset.py ./ +COPY lstm_train_python/model.pt \ + ./generated/model.pt +EXPOSE 8000 +CMD ["python", "-m", "uvicorn", "app:app", \ + "--host", "0.0.0.0", "--port", "8000"] +\end{lstlisting} + +Using this slim-base approach with NFS-mounted libraries reduces the container image from $\sim$2~GB (if \texttt{torch} were pip-installed) to approximately 62--75~MB. The GPU-enabled production containers that served the inference benchmarks use the NVIDIA CUDA base image, resulting in the 3.98--4.02~GB sizes reported in Table~\ref{tab:docker_sizes}. The CPU slim approach demonstrates that the Python dependency footprint can be reduced to near-Rust levels when GPU support is not required, though it was not benchmarked in this study. + +Key trade-offs of the NFS approach include: +\begin{itemize} + \item \textbf{Pros:} Eliminates redundant installs, ensures consistent library versions across all containers, and enables instant library updates without image rebuilds. + \item \textbf{Cons:} Introduces a runtime dependency on network connectivity to the NFS server VM; CUDA driver compatibility must still be managed at the host level; NFS latency is higher than local disk for library loading. 
+\end{itemize} + +\subsubsection{Rust/Burn: Statically Compiled Microservice} + +The Rust deployment pipeline requires no equivalent workaround because Burn compiles model weights, HTTP routing logic, and inference code into a single statically linked binary. A multi-stage Dockerfile builds the release binary inside a full Rust toolchain container, then transfers the resulting artifact into a minimal \texttt{alpine:3.23} or \texttt{nvidia/vulkan:1.3-470} runtime image. No Python interpreter, pip packages, or dynamic library resolution occurs at container startup. + +HTTP routing uses the Axum framework. Model weights are loaded via Burn's \texttt{CompactRecorder} from \texttt{.mpk} files serialized during training. Tensor dimension validation occurs at compile time via Rust's generic type system (\texttt{Tensor<B, 2>} vs.\ \texttt{Tensor<B, 3>}), which eliminates a class of shape-mismatch bugs that would manifest as runtime errors in Python. + +\subsection{HTTP Load Testing with Locust} + +\subsubsection{Setup} + +Both services were subjected to identical 5-minute Locust campaigns with 100 concurrent virtual users targeting the \texttt{POST /predict} endpoint. Tests were conducted independently for each of the four model types across both backends. Each test was repeated eight times; results reported are means with variance. Zero failures were recorded across all tests for both languages. + +\subsubsection{Results} + +Tables~\ref{tab:locust_python} and~\ref{tab:locust_rust} present complete request statistics and latency percentile distributions for Python and Rust services respectively. 
+ +\begin{table*}[t] +\centering +\caption{Python Inference Service: Locust Load Test Results (100 Users, 5 Min, 0 Failures)} +\label{tab:locust_python} +\begin{tabular}{|l|c|c|c|c|c|c|c|c|c|} +\hline +\textbf{Model} & \textbf{Requests} & \textbf{RPS} & \textbf{Avg (ms)} & \textbf{Min} & \textbf{Max} & \textbf{P50} & \textbf{P90} & \textbf{P95} & \textbf{P99} \\ +\hline +Regression & 175,491 & 584.66 & 15.09 & 2 & 552 & 11 & 28 & 41 & 79 \\ +MNIST CNN & 161,484 & 538.12 & 30.75 & 3 & 1793 & 27 & 54 & 65 & 93 \\ +Text Class. & 73,583 & 245.16 & 250.66 & 5 & 666 & 250 & 290 & 310 & 390 \\ +LSTM & 42,241 & 140.77 & 548.36 & 8 & 8781 & 550 & 620 & 640 & 690 \\ +\hline +\end{tabular} +\end{table*} + +\begin{table*}[t] +\centering +\caption{Rust Inference Service: Locust Load Test Results (100 Users, 5 Min, 0 Failures)} +\label{tab:locust_rust} +\begin{tabular}{|l|c|c|c|c|c|c|c|c|c|} +\hline +\textbf{Model} & \textbf{Requests} & \textbf{RPS} & \textbf{Avg (ms)} & \textbf{Min} & \textbf{Max} & \textbf{P50} & \textbf{P90} & \textbf{P95} & \textbf{P99} \\ +\hline +Regression & 176,356 & 587.46 & 14.51 & 2 & 206 & 11 & 26 & 35 & 77 \\ +MNIST CNN & 89,445 & 329.87 & 146.32 & 8 & 572 & 140 & 220 & 250 & 320 \\ +Text Class. & 24,794 & 82.62 & 1038.78 & 12 & 1462 & 1000 & 1200 & 1200 & 1300 \\ +LSTM & 15,026 & 50.08 & 1808.99 & 154 & 5436 & 1700 & 3000 & 3400 & 4200 \\ +\hline +\end{tabular} +\end{table*} + +\subsubsection{Throughput Analysis} + +Fig.~\ref{fig:rps} shows the throughput comparison across all four models. + +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig5_rps_comparison.png} +\caption{Inference throughput (RPS) under 100-user concurrent Locust load. 
Python and Rust are nearly identical for regression; Python leads significantly for compute-heavy models.} +\label{fig:rps} +\end{figure} + +For the regression task, both services achieve near-identical throughput (584.66 vs.\ 587.46~RPS), confirming that for lightweight inference, HTTP overhead dominates and language choice is immaterial. For MNIST, Python achieves $1.63{\times}$ higher throughput (538.12 vs.\ 329.87~RPS). The gap is most pronounced for sequential and attention-based models: Python outperforms Rust by $2.97{\times}$ for text classification (245.16 vs.\ 82.62~RPS) and by $2.81{\times}$ for LSTM (140.77 vs.\ 50.08~RPS). These gaps reflect PyTorch's mature, highly optimized kernel library (\texttt{libtorch}, cuDNN, MKL-DNN) rather than any fundamental language-level limitation. + +\subsubsection{Latency Analysis} + +Fig.~\ref{fig:latency} and Fig.~\ref{fig:percentiles} present the average latency and percentile distributions respectively. + +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig6_latency_comparison.png} +\caption{Average inference latency (ms, log scale). For compute-heavy models (LSTM, Text Classification), the Python advantage grows substantially.} +\label{fig:latency} +\end{figure} + +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig7_percentile_dist.png} +\caption{Latency percentile distribution (P50/P95/P99) across all four model types. The tail-latency advantage of Python is most visible in the LSTM task, where P99 is 690~ms (Python) vs.\ 4200~ms (Rust)---a $6.1{\times}$ gap.} +\label{fig:percentiles} +\end{figure} + +Examining tail latency (P99) reveals that Rust's variance under load is higher than Python's for complex models. 
For LSTM, the P99 latency gap between Rust (4200~ms) and Python (690~ms) represents a $6.1{\times}$ difference---substantially larger than the mean latency gap, suggesting that under peak concurrent load the Rust LSTM service experiences more variance. For regression, P99 latencies are nearly identical (79~ms Python vs.\ 77~ms Rust), confirming parity for lightweight models. + +\FloatBarrier + +\subsection{Container Image Size} + +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig9_docker_sizes.png} +\caption{Docker container image sizes. Python GPU containers consistently occupy $\sim$4~GB; Rust containers occupy $\sim$1~GB, a $4{\times}$ reduction.} +\label{fig:docker_sizes} +\end{figure} + +\begin{table}[htbp] +\centering +\caption{Docker Container Image Size Comparison} +\label{tab:docker_sizes} +\begin{tabular}{|l|c|c|c|} +\hline +\textbf{Task} & \textbf{Python (GB)} & \textbf{Rust (GB)} & \textbf{Ratio} \\ +\hline +MNIST CNN & 3.98 & 1.02 & $3.9{\times}$ \\ +Text Classification & 4.02 & $\approx$1.00 & $4.0{\times}$ \\ +Regression & 3.98 & 0.97 & $4.1{\times}$ \\ +LSTM & 3.98 & 0.97 & $4.1{\times}$ \\ +\hline +\textbf{Mean} & \textbf{3.99} & \textbf{0.99} & $\mathbf{4.0{\times}}$ \\ +\hline +\end{tabular} +\end{table} + +The Python containers use the NVIDIA CUDA base image with a full GPU stack. The Rust containers use a minimal Vulkan-enabled runtime image with only the compiled binary and model weights. The $4{\times}$ size difference translates directly into reduced registry pull latency, lower storage costs, and faster Kubernetes pod startup times. + +\FloatBarrier + +\subsection{Model Artifact Size} + +\begin{figure}[htbp] +\centering +\includegraphics[width=\linewidth]{figures/fig8_model_sizes.png} +\caption{Serialized model artifact sizes (KB, log scale). For the regression model, both formats are compact (4~KB Rust vs.\ 5~KB Python). 
For large models, Rust is 45--48\% smaller.} +\label{fig:model_sizes} +\end{figure} + +\begin{table}[htbp] +\centering +\caption{Serialized Model Artifact Size Comparison} +\label{tab:model_sizes} +\begin{tabular}{|l|c|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk)} & \textbf{Python (.pt)} & \textbf{Reduction} \\ +\hline +MNIST CNN & 1.1~MB & 2.0~MB & 45.0\% \\ +Text Class. & 21~MB & 40.6~MB & 48.3\% \\ +Regression & 4~KB & 5~KB & 20.0\% \\ +LSTM & 60~KB & 120~KB & 50.0\% \\ +\hline +\end{tabular} +\end{table} + +For the regression model, both artifacts are negligibly small (4~KB Rust, 5~KB Python), as the 641-parameter model contains insufficient weight data for either format's overhead to dominate. The more operationally significant differences appear at scale: text classification serializes to 21~MB (Rust) versus 40.6~MB (Python)---a 48\% reduction driven by PyTorch's embedded optimizer state and Python pickle metadata. Burn's \texttt{CompactRecorder} stores only weight tensors in a flat binary encoding with no embedded metadata. + +\FloatBarrier + +%======================================================================= +\section{Discussion} +\label{sec:discussion} + +\subsection{Training Feasibility and Convergence} + +Track~1 results establish that Burn is a viable end-to-end training platform across all four evaluated architectures. Final validation accuracy on MNIST is $98.52\%$ (Rust, 95\% CI: $[98.36\%, 98.68\%]$) versus $98.20\%$ (Python, 95\% CI: $[98.04\%, 98.36\%]$). These confidence intervals do not overlap, indicating a small but statistically detectable difference in favor of Rust on this task, though the practical magnitude ($0.32$ percentage points) is negligible. + +For text classification, Python achieves a higher macro F1 ($79.03\%$, CI: $[78.23\%, 79.83\%]$) than Rust ($76.51\%$, CI: $[75.87\%, 77.15\%]$), with non-overlapping confidence intervals. 
This difference, while modest, is statistically detectable and may reflect differences in weight initialization defaults or the interaction between the Noam scheduler and Burn's optimizer implementation. + +For LSTM, Rust achieves a substantially lower final validation loss ($17.59$, CI: $[14.73, 20.45]$) compared to Python ($48.92$, CI: $[43.19, 54.65]$). These intervals do not overlap, suggesting a genuine difference; however, this gap is most likely attributable to differences in synthetic data generation and normalization between the two pipelines rather than a fundamental framework advantage. + +Across all tasks, both frameworks converge without numerical instability. Zero NaN events were recorded in any run, confirming that Burn's automatic differentiation and gradient accumulation are numerically stable across the tested architectures. + +\subsection{The Inference Throughput Gap: Technical Explanation} + +The most counter-intuitive finding of this study is that Rust, despite eliminating Python's GIL and offering language-level concurrency guarantees, achieves lower inference throughput than Python for compute-intensive models. For LSTM, Python achieves $2.81\times$ higher RPS; for text classification, $2.97\times$ higher RPS. This result requires careful interpretation. + +The bottleneck in ML inference is not thread scheduling or request dispatching---it is tensor kernel throughput. PyTorch's \texttt{libtorch} backend integrates cuDNN for convolutions, MKL-DNN for CPU matrix operations, and fused attention kernels developed over years of production use. Burn's GPU backend, while architecturally sound, currently lacks: (1)~\emph{fused kernels} for multi-head attention, layer normalization, and LSTM gating, which PyTorch executes in a single GPU dispatch; (2)~\emph{cuDNN integration} providing hardware-specific convolution algorithms; and (3)~\emph{dynamic request batching} at the HTTP layer. + +The tail latency disparity amplifies this picture. 
For the LSTM task, Python's P99 latency is 690~ms versus Rust's 4200~ms, a $6.1\times$ gap substantially wider than the mean gap ($3.3\times$). This widening is consistent with GPU dispatch queue saturation under concurrent load: when GPU kernel dispatch lacks the throughput to clear the queue at the rate imposed by 100 concurrent users, requests accumulate and tail latency grows super-linearly. For regression, both achieve $\sim$585~RPS, confirming that the performance gap is a function of kernel maturity relative to model complexity, not a general property of the language or runtime. + +\subsection{Deployment Advantages: Quantified} + +Rust's deployment advantages are consistent across all four models. Container images average $0.99$~GB versus $3.99$~GB for Python---a $4.0\times$ reduction translating to faster registry pulls, lower cold-start latency, and reduced storage costs~\cite{vasiliev2024scaling}. + +For the regression model, both artifacts are negligibly small (4~KB Rust, 5~KB Python), as the 641-parameter model contains insufficient weight data for either format's overhead to dominate. The more operationally significant differences appear at scale: text classification serializes to 21~MB (Rust) versus 40.6~MB (Python)---a 48\% reduction driven by PyTorch's embedded optimizer state and Python pickle metadata. + +\subsection{NFS Strategy: Scope and Limitations} + +The NFS-augmented deployment architecture addresses the dependency management challenge for Python inference containers. Rather than reducing the deployed image size---the GPU containers remain at 3.98--4.02~GB, as the NVIDIA CUDA base image is required for accelerated inference---the NFS approach eliminates redundant library installation across container instances and provides a single point of version control for ML dependencies. A CPU-only slim container using \texttt{python:3.12-slim} with NFS-mounted \texttt{torch} reduces the image to approximately 62--75~MB, at the cost of GPU acceleration. 
+ +This strategy is most appropriate for controlled, on-premises deployments where NFS infrastructure can be maintained reliably. For public cloud or serverless deployments, it introduces a network dependency that may not be acceptable. Kubernetes-based deployments with NFS persistent volume claims represent a portable middle ground~\cite{k8sml2024}. + + +\subsection{Limitations} + +Several limitations bound the scope of the conclusions. First, all experiments use the Burn GPU backend; Burn also supports a LibTorch backend that would be expected to substantially close the inference throughput gap. Second, the load testing campaigns do not include request batching, which disproportionately benefits GPU-accelerated backends. Third, training comparisons do not include profiling traces, preventing direct attribution of training time differences to specific operations. Fourth, energy consumption and carbon footprint---increasingly relevant metrics for production ML systems~\cite{strubell2019energy}---are not measured. + +\subsection{Practical Decision Framework} + +Based on the empirical results, the following guidance is offered for practitioners. + +\textit{Choose Rust/Burn when:} deployment footprint is the primary constraint---edge, embedded, or bandwidth-limited environments where $<$1~GB containers and kilobyte-scale model artifacts are material requirements; compile-time tensor dimension safety is required to reduce production incidents from shape mismatches; or memory-safe, dependency-minimal binaries are required for security-critical deployments. + +\textit{Choose Python/PyTorch when:} maximum inference throughput is the primary requirement (demonstrated $3\times$ RPS advantage for sequential and attention-based models); lowest latency is essential (demonstrated $6\times$ P99 advantage for LSTM); or the Hugging Face and torchvision ecosystems are required for pre-trained model access. 
+ +\textit{A hybrid strategy}---training in PyTorch for ecosystem richness and iteration speed, followed by weight export and reloading in a Rust/Burn inference service for deployment efficiency---may represent the best practical trade-off as Burn's cross-framework import capabilities mature. + +%======================================================================= +\section{Conclusion} +\label{sec:conclusion} + +This paper presented a structured, two-track empirical evaluation of Rust (Burn) and Python (PyTorch) for machine learning across training, inference deployment, and system-level characteristics. Four model architectures were evaluated---CNN, Transformer, LSTM, and feed-forward regression---with all experiments repeated eight times to report mean and variance. + +The principal findings are as follows. First, training accuracy is functionally equivalent between Burn and PyTorch across all evaluated tasks, establishing Rust as a viable end-to-end training platform. Second, Python inference services outperform Rust in throughput for compute-intensive models by up to $3.0{\times}$, and in P99 tail latency by up to $6.1{\times}$, a result driven by PyTorch's mature kernel library rather than any language-level property. Third, Rust provides a consistent $4{\times}$ reduction in container image size and 45--50\% smaller model artifacts for large models, with both regression artifacts being negligibly small at this scale. Fourth, an NFS-augmented deployment strategy for Python is described that decouples library dependencies from container images, applicable in controlled on-premises environments. + +These findings suggest that the two ecosystems are best understood as complementary rather than competing. Python/PyTorch dominates for throughput-sensitive serving and training workflows requiring ecosystem breadth. Rust/Burn is the superior choice for minimal-footprint, memory-safe, and edge-optimized deployment. 
As the Burn framework matures its GPU-accelerated backends---particularly LibTorch and native CUDA integration---the inference performance gap documented here is expected to narrow, potentially making Rust a viable primary platform across the full ML lifecycle. + +%======================================================================= +\begin{thebibliography}{00} + +\bibitem{paszke2019pytorch} +A. Paszke \textit{et al.}, ``PyTorch: An imperative style, high-performance deep learning library,'' in \textit{Adv. Neural Inf. Process. Syst.}, vol.~32, 2019. + +\bibitem{rust2023} +The Rust Programming Language Community, ``The Rust programming language,'' 2023. [Online]. Available: \url{https://www.rust-lang.org} + +\bibitem{burn2024} +N. Gagnon-Marchand \textit{et al.}, ``Burn: A flexible and comprehensive deep learning framework in Rust,'' GitHub, 2024. [Online]. Available: \url{https://github.com/tracel-ai/burn} + +\bibitem{vaswani2017attention} +A. Vaswani \textit{et al.}, ``Attention is all you need,'' in \textit{Adv. Neural Inf. Process. Syst.}, vol.~30, 2017. + +\bibitem{hochreiter1997lstm} +S. Hochreiter and J. Schmidhuber, ``Long short-term memory,'' \textit{Neural Computation}, vol.~9, no.~8, pp.~1735--1780, 1997. + +\bibitem{kingma2015adam} +D. P. Kingma and J. Ba, ``Adam: A method for stochastic optimization,'' in \textit{Proc. ICLR}, 2015. + +\bibitem{ba2016layernorm} +J. L. Ba, J. R. Kiros, and G. E. Hinton, ``Layer normalization,'' \textit{arXiv preprint arXiv:1607.06450}, 2016. + +\bibitem{merkel2014docker} +D. Merkel, ``Docker: Lightweight Linux containers for consistent development and deployment,'' \textit{Linux Journal}, vol.~2014, no.~239, 2014. + +\bibitem{gujarati2020clockwork} +A. Gujarati \textit{et al.}, ``Serving DNNs like clockwork: Performance predictability from the bottom up,'' in \textit{Proc. 14th USENIX OSDI}, 2020. + +\bibitem{locust} +J. Hamren \textit{et al.}, ``Locust: An open-source load testing tool,'' GitHub. [Online]. 
Available: \url{https://github.com/locustio/locust} + +\bibitem{bezanson2017julia} +J. Bezanson, A. Edelman, S. Karpinski, and V. B. Shah, ``Julia: A fresh approach to numerical computing,'' \textit{SIAM Review}, vol.~59, no.~1, pp.~65--98, 2017. + +\bibitem{onnxruntime} +Microsoft, ``ONNX Runtime: Cross-platform, high performance ML inferencing and training,'' 2021. [Online]. Available: \url{https://onnxruntime.ai} + +\bibitem{pytorchvstf2025} +Z. Ba Alawi, ``A comparative survey of PyTorch vs TensorFlow for deep learning: Usability, performance, and deployment trade-offs,'' \textit{arXiv preprint arXiv:2508.04035}, 2025. + +\bibitem{vella2023rust} +V. Vella, ``Boosting machine learning performance with Rust,'' \textit{Better Programming}, June 2023. [Online]. Available: \url{https://betterprogramming.pub/boosting-machine-learning-performance-with-rust-aab1f3ae1424} + +\bibitem{crespo2023rust} +J. Crespo, ``Supercharging machine learning performance: 4x improvement with Rust over Python,'' \textit{LinkedIn}, May 2023. + +\bibitem{athanx2024rust} +A. X. Athan, ``Choosing the right Rust machine learning framework: Candle, Burn, DFDX, or tch-rs?,'' \textit{Medium}, March 2024. + +\bibitem{li2024mobileinference} +Z. Li, M. Paolieri, and L. Golubchik, ``A benchmark for ML inference latency on mobile devices,'' in \textit{Proc. 7th Int. Workshop on Edge Systems, Analytics and Networking (EdgeSys)}, ACM, 2024. + +\bibitem{derosa2024modelserving} +P. De Rosa \textit{et al.}, ``On the cost of model-serving frameworks: An experimental evaluation,'' in \textit{Proc. IEEE Int. Conf. Cloud Engineering (IC2E)}, 2024, pp.~221--232. + +\bibitem{buyya2009cloud} +R. Buyya, C. S. Yeo, S. Venugopal, J. Broberg, and I. Brandic, ``Cloud computing and emerging IT platforms: Vision, hype, and reality for delivering computing as the 5th utility,'' \textit{Future Generation Computer Systems}, vol.~25, no.~6, pp.~599--616, 2009. + +\bibitem{vasiliev2024scaling} +G. 
Ramesh \textit{et al.}, ``A comprehensive review on scaling machine learning workflows using cloud technologies and DevOps,'' \textit{IEEE Access}, 2024. [Online]. Available: \url{https://ieeexplore.ieee.org/document/11126113} + +\bibitem{mldaas2024} +K. P. Ravikumar, N. Ahmed, and M. S. Singh, ``ML-DaaS: An integrated ML training and deployment framework for hybrid cloud,'' \textit{IEEE Access}, 2024. [Online]. Available: \url{https://ieeexplore.ieee.org/document/11261485} + +\bibitem{mohanty2024cicd} +M. V. Velumani and P. K. Muthukamatchi, ``Cloud-native software defect prediction: Leveraging machine learning models in scalable CI/CD pipelines,'' \textit{IEEE Access}, 2024. [Online]. Available: \url{https://ieeexplore.ieee.org/document/11212348} + +\bibitem{k8sml2024} +D. Kutsa, ``Using machine learning for analyzing performance metrics in Kubernetes,'' \textit{IJSRED}, vol.~8, no.~2, 2025. [Online]. Available: \url{https://www.ijsred.com/volume8/issue2/IJSRED-V8I2P143.pdf} + +\bibitem{strubell2019energy} +E. Strubell, A. Ganesh, and A. McCallum, ``Energy and policy considerations for deep learning in NLP,'' in \textit{Proc. ACL}, 2019. + +\bibitem{watada2019containers} +J. Watada \textit{et al.}, ``Emerging trends, techniques and open issues of containerization: A review,'' \textit{IEEE Access}, vol.~7, pp.~152443--152472, 2019. + +\end{thebibliography} +\EOD +\end{document} \ No newline at end of file diff --git a/latex_reports/lstm.tex b/latex_reports/lstm.tex new file mode 100644 index 0000000..fcf3814 --- /dev/null +++ b/latex_reports/lstm.tex @@ -0,0 +1,326 @@ +\documentclass[12pt]{article} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{graphicx} +\usepackage{hyperref} +\usepackage{minted} +\usepackage{geometry} +\geometry{a4paper, margin=1in} +\usepackage{pdfpages} + + +\title{\textbf{Comparative Analysis of LSTM Implementation: Rust (Burn) vs. 
PyTorch}} +\author{Technical Report} +\date{\today} + +\begin{document} + +\maketitle +\tableofcontents +\newpage + +\section{Introduction} +This document outlines the detailed architectural, mathematical, and translational specifics of implementing a Long Short-Term Memory (LSTM) model across two prominent machine learning environments: Rust (using the Burn framework) and Python (using PyTorch). It covers the model architecture, training pipelines, specialized deployment techniques using network filesystems (NFS) with Docker, and language-specific design implications. + +\section{Model Architecture and Mathematical Formulation} + +\subsection{Mathematical Foundation of the LSTM Cell} +The core of the model revolves around a custom, manually-implemented LSTM cell. Instead of relying on the standard un-inspectable black-box LSTM implementations provided by typical ML libraries, both codebases explicitly define the cell-level math. + +For a given timestep $t$, the input tensor $x_t$ and the previous hidden state $h_{t-1}$ are used to compute the various gates. The mathematical formulation utilized is: +\begin{align} + f_t &= \sigma(W_f \cdot [h_{t-1}, x_t] + b_f) \quad &\text{(Forget Gate)} \\ + i_t &= \sigma(W_i \cdot [h_{t-1}, x_t] + b_i) \quad &\text{(Input Gate)} \\ + g_t &= \tanh(W_g \cdot [h_{t-1}, x_t] + b_g) \quad &\text{(Candidate State)} \\ + o_t &= \sigma(W_o \cdot [h_{t-1}, x_t] + b_o) \quad &\text{(Output Gate)} +\end{align} +\begin{align} + c_t &= f_t \odot c_{t-1} + i_t \odot g_t \quad &\text{(New Cell State)} \\ + h_t &= o_t \odot \tanh(c_t) \quad &\text{(New Hidden State)} +\end{align} + +Where: +\begin{itemize} + \item $\sigma$ represents the Sigmoid activation function. + \item $\tanh$ represents the Hyperbolic Tangent activation function. + \item $\odot$ denotes element-wise multiplication (Hadamard product). + \item $[h_{t-1}, x_t]$ symbolizes the concatenation of the previous hidden state and the current input. 
+\end{itemize} + +\subsection{Architectural Details} +Both implementations adhere strictly to the following architectural design: +\begin{enumerate} + \item \textbf{Layer Normalization:} Pre-activation gates, the cell state ($c_t$), and the hidden state ($h_t$) pass through separate \texttt{LayerNorm} layers. This design choice stabilizes training dynamics since the feature distributions inside the LSTM evolve at every sequence step (making standard Batch Normalization ineffective). + \item \textbf{Optimized Gate Compute:} Instead of computing 4 separate linear transformations per timestep for the features, the model employs a single combined projection that outputs a $4 \times \text{hidden\_size}$ tensor. This tensor is subsequently split into four chunks corresponding to the $i, f, g$, and $o$ gates. + \item \textbf{Bidirectional Support:} An encapsulated \texttt{StackedLstm} module stacks multiple manual LSTM layers (applying dropout between layers except on the final one). The main \texttt{LstmNetwork} integrates a forward processing stack and an optional backward processing stack (which flips the temporal dimension of the input sequence). Their respective output hidden states are concatenated along the feature dimension before passing through a fully-connected projection head. + \item \textbf{Initialization bias:} The forget-gate bias parameters are explicitly initialized to $1.0$ (via Xavier Normal parameter slicing) to prevent fatal early-training gradient decay. +\end{enumerate} + + +\section{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. 
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{RustLocust/LSTM.pdf} + + + +\section{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{PythonLocust/LSTM.pdf} + +\section{Training Pipeline} +The training behavior is intentionally synchronized to ensure parity between the languages: +\begin{itemize} + \item \textbf{Data Loading:} Operates synchronously on synthetically generated noisy sequential datasets. The validation set is scaled symmetrically relative to the training set ($20\%$ of training size). + \item \textbf{Optimization Algorithm:} Utilizes the Adam Optimizer. + \item \textbf{Loss Function:} Mean Squared Error (MSE), with reduction set to \textit{mean}. Both explicitly weigh loss accumulation during epoch passes by scaling local batch losses by the discrete batch size, averaging properly at the conclusion of the epoch. + \item \textbf{Gradient Clipping:} Ensures numerical stability on longer sequence inputs. 
The gradient norm is strictly clipped to $\max = 1.0$ right before the optimizer steps. + \item \textbf{Artifacts Output:} Training scripts generate an \texttt{artifact\_dir} where they store a \texttt{config.json} representation of hyperparameters, and the full state dictionary (\texttt{model.pt} in PyTorch; CompactRecorder files in Rust Burn). +\end{itemize} + +\section{Inference Pipeline and Docker NFS Integration} +\subsection{PyTorch Inference Architecture} +A critical requirement for modern PyTorch inference deployments is resolving the massive disk footprint of CUDA-enabled PyTorch backend libraries. The PyTorch pipeline employs a sophisticated Network File System (NFS) logic to achieve a highly optimized, lightweight Dockerized inference deployment: +\begin{enumerate} + \item \textbf{External Library Mounting:} A host-level script (\texttt{mount\_libs.sh}) maps an external NAS/NFS storage partition (from \texttt{172.16.203.14}) loaded with Python environments targeting \texttt{/mnt/LSTM-libs}. + \item \textbf{Optimized Dockerfile:} The image leverages the \texttt{nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04} base image and installs basic \texttt{python3.11} runtime headers without calling \texttt{pip install torch}. Thus, the final image size is structurally negligible compared to standard ml-images. + \item \textbf{Runtime Binding:} The inference container bootloader scripts (\texttt{run\_container.sh}) bind these volume mounts (\texttt{-v \$NFS\_MOUNT\_POINT:/external-libs}) and crucially overrides the \texttt{PYTHONPATH} env-variable: + \begin{verbatim} + -e PYTHONPATH="$CONTAINER_LIB_MOUNT/LSTM_env/lib/.../site-packages" + \end{verbatim} + \item \textbf{Inference Execution:} \texttt{app.py} loads the model weights off an abstracted configuration path, builds a zero-gradient loader, runs inference iteratively over a single collapsed batch, and yields predictions natively. 
+\end{enumerate} + +\subsection{Rust Inference Architecture} +Rust's inference pipeline diverges significantly regarding deployment complexity due to compilation structures: +\begin{itemize} + \item \textbf{Stateless Binaries:} No containerized runtime libraries are mandated because Burn compiles statically down to heavily optimized binaries, pulling model states directly via the \texttt{CompactRecorder}. + \item \textbf{Visualization:} Results are mapped into native Polars \texttt{DataFrame} objects (\texttt{df![]}) rendering lightweight native tables detailing \textit{expected targets} versus \textit{computed predictions}. +\end{itemize} + +\section{Implementation Specifics} +\subsection{PyTorch Specific Constraints} +\begin{itemize} + \item \textbf{Dynamic computation graphing:} The \texttt{model.py} cleanly slices and chunks gates natively on tensors (e.g., \texttt{gates.chunk(4, dim=1)}). + \item \textbf{Sequence Reversals:} Done programmatically via continuous \texttt{Tensor.flip(dims=[1])} which mandates that tensors must remain contiguously stored within PyTorch internals to avoid memory reallocation overhead. + \item \textbf{Seed Setting API:} Requires deterministic locking across four sub-systems (\texttt{random, numpy, torch, torch.cuda}) to match Rust's reproducibility parameters. +\end{itemize} + +\subsection{Rust (Burn) Specific Constraints} +\begin{itemize} + \item \textbf{Compile-Time Dimension Types:} Rust explicitly binds Tensor dimensionality at compile time (e.g., \texttt{Tensor<B, 2>} vs \texttt{Tensor<B, 3>}). This offers unmatched safety by rejecting, at compile time, dimension mismatches that PyTorch would only surface as runtime errors. + \item \textbf{Trait Encapsulation:} Leverages explicit trait architectures (\texttt{\#[derive(Module, Config)]}) that automate saving hyperparameters and generating gradient backends. Burn models must be mapped cleanly from standard states to \texttt{autodiff} states.
+ \item \textbf{No-Mutation Logic:} State mutations generated sequentially in LSTMs are represented safely utilizing explicit struct destructuring via \texttt{LstmState\{hidden, cell\}}, bypassing complex internal pointer tracking. + \item \textbf{Explicit Initialization Handling:} Since Burn offers only limited orthogonal initialization out of the box, Xavier normal initialization was invoked explicitly, paired with \texttt{slice\_assign} tensor mappings to safely load the constant 1.0 fill into the forget-gate components. +\end{itemize} + +\section{Rust: Training Loss Progression and Model Convergence} + +The model was trained for a total of 30 epochs. During training, both the training loss and validation loss decreased substantially, indicating that the model was able to learn meaningful patterns from the data. + +\subsection{Training Progress} + +The training process began with relatively high loss values. However, as training progressed, both the average training loss and average validation loss consistently decreased. + +The recorded loss values at different stages of training are shown below: + +\begin{table}[h] +\centering +\caption{Training and Validation Loss Progression} +\begin{tabular}{|c|c|c|} +\hline +\textbf{Epoch} & \textbf{Average Training Loss} & \textbf{Average Validation Loss} \\ +\hline +5 & 4456.9658 & 4473.4448 \\ +10 & 2510.1016 & 2438.3970 \\ +15 & 900.7457 & 801.6573 \\ +20 & 154.4127 & 164.8311 \\ +25 & 48.2149 & 20.1441 \\ +30 & 52.1122 & 17.5850 \\ +\hline +\end{tabular} +\label{tab:loss_progression} +\end{table} + +\subsection{Loss Trend Analysis} + +The training loss decreased from 4456.9658 at epoch 5 to 52.1122 at epoch 30. Similarly, the validation loss decreased from 4473.4448 to 17.5850 over the same period. + +This large reduction in both training and validation loss suggests that the model successfully converged during training. + +Although the training loss slightly increased between epoch 25 and epoch 30, the validation loss continued to decrease.
This indicates that the model continued to improve its ability to generalize to unseen data. + +The lowest validation loss achieved during the experiment was: + +\[ +17.5850 +\] + +at epoch 30. + +\subsection{Generalization Performance} + +The close alignment between the training loss and validation loss throughout training suggests that the model did not suffer from severe overfitting. + +In the earlier epochs, both losses were very high, which is expected because the model parameters were still being optimized. As training continued, the losses dropped rapidly, especially between epochs 10 and 25. + +This behavior indicates that the model learned most of its predictive capability during the middle phase of training. + +\subsection{Execution Time} + +The complete training process required: + +\begin{itemize} + \item Real time: 6 minutes and 42.185 seconds + \item User CPU time: 6 minutes and 42.414 seconds + \item System CPU time: 3.49 seconds +\end{itemize} + +The relatively low system CPU time compared to user CPU time suggests that most of the runtime was spent performing model computation rather than operating system overhead. +\section{Python: LSTM Model Architecture and Training Performance} + +The Long Short-Term Memory (LSTM) model consists of a 2-layer bidirectional LSTM followed by a fully connected output layer. Dropout is applied between LSTM layers to improve generalization. + +\textbf{Model Architecture:} + + +The model was trained for 30 epochs. Training and validation performance improved significantly and consistently. + +Training loss decreased from 5543.18 to 56.11 (98.99\% reduction), while validation loss decreased from 5699.26 to 48.92. +The model achieved strong regression performance, with validation RMSE reaching 6.99 and MAE reaching 4.33. + +The $R^2$ score improved from negative values to 0.9517, indicating strong predictive capability. 
+ +\begin{table}[h] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Loss & 56.11 & 30 & 5543.18 & 1 \\ +Train & RMSE & 7.49 & 30 & 74.45 & 1 \\ +Train & MAE & 5.03 & 30 & 67.25 & 1 \\ +Train & R$^2$ & -4.41 & 1 & 0.9452 & 30 \\ +Train & Grad Norm (Total) & 524.61 & 1 & 4972.41 & 24 \\ +Train & Iteration Speed (it/s) & 5.02 & 29 & 6.94 & 4 \\ +Train & CPU Memory (GB) & 0.87 & -- & 1.13 & -- \\ +Train & CPU Usage (\%) & 72.4 & -- & 88.8 & -- \\ +\hline + +Valid & Loss & 47.96 & 29 & 5699.26 & 1 \\ +Valid & RMSE & 6.93 & 29 & 75.49 & 1 \\ +Valid & MAE & 3.54 & 28 & 68.51 & 1 \\ +Valid & R$^2$ & -4.63 & 1 & 0.9526 & 29 \\ +Valid & Iteration Speed (it/s) & 5.02 & 29 & 6.94 & 4 \\ +Valid & CPU Memory (GB) & 0.87 & -- & 1.13 & -- \\ +Valid & CPU Usage (\%) & 72.4 & -- & 88.8 & -- \\ +\hline + +\end{tabular} +\caption{Extended LSTM Training and Validation Metrics Summary} +\end{table} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 158.63 seconds + \item Average Epoch Time: 5.18 seconds + \item Iteration Speed (Mean): 6.22 it/s + \item Gradient Norm (Mean): 1394.59 + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + + + + +\newpage + +\section{Container Comparison: Python vs Rust} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 974MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & 
High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\section{Model Size Comparison} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ + +\end{document} diff --git a/latex_reports/main.tex b/latex_reports/main.tex new file mode 100644 index 0000000..1c10156 --- /dev/null +++ b/latex_reports/main.tex @@ -0,0 +1,227 @@ +\documentclass[11pt,a4paper]{article} + +\usepackage{geometry} +\geometry{margin=1in} + +\usepackage{amsmath,amssymb} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{enumitem} +\usepackage{array} +\usepackage{float} + +\title{\textbf{Progress Report: System-Level Evaluation of Rust and Python for Machine Learning}} +\author{Project Elective} +\date{\today} + +\begin{document} + +\maketitle + +\section{Overview of the Project} + +This project studies the use of \textbf{Rust} as an alternative systems language for machine learning workflows traditionally implemented in \textbf{Python}. +Rather than focusing on state-of-the-art model performance, the emphasis is on: + +\begin{itemize} + \item feasibility of end-to-end ML workflows, + \item system stability and reproducibility, + \item developer experience and DevOps complexity, + \item deployment and operational characteristics. +\end{itemize} + +To ensure clarity and rigor, the work is organized into \textbf{two clearly separated experimental tracks}. + +--- + +\section{Project Structure: Two-Track Evaluation} + +The project consists of the following two tracks: + +\subsection*{Track 1: Training-Based Systems Evaluation} +This track compares \textbf{machine learning training pipelines} implemented in: +\begin{itemize} + \item PyTorch (Python), and + \item Burn (Rust). +\end{itemize} + +The goal is to evaluate training feasibility, stability, compile-time guarantees, and DevOps impact, rather than raw training speed. + +\subsection*{Track 2: Inference-Based DevOps Evaluation} +This track compares \textbf{production-style inference services} implemented in: +\begin{itemize} + \item Python-based ONNX inference, and + \item Rust-based ONNX inference. 
+\end{itemize} + +The focus is on deployment, security, containerization, CI/CD behavior, and runtime efficiency. + +Each track is designed to answer a distinct research question while remaining complementary. + +--- + +\section{Machine Learning Tasks Considered} + +To ensure coverage of diverse ML workloads, the following tasks are identified: + +\begin{itemize} + \item \textbf{Text Classification}: Dataset to be finalized. + \item \textbf{Image Classification}: MNIST dataset. + \item \textbf{Credit Score Assignment}: Supervised classification task. + \item \textbf{Multi-Objective Machine Learning}: Brain Tumor dataset with a MOML formulation. + \item \textbf{Fine-Tuning Task}: BERT-based classification (ANLP Assignment 1), with optional LoRA / QLoRA. + \item \textbf{Autoregressive Decoding}: Experiments using the Burn framework. +\end{itemize} + +At the current stage, the \textbf{MNIST image classification task has been fully implemented}. +The corresponding training code is available in the project GitHub repository. 
+ +--- + +\section{Related Work} + +The following research papers are being used to guide experimental design and evaluation: + +\begin{itemize} + \item \url{https://ieeexplore.ieee.org/document/11126113} + \item \url{https://ieeexplore.ieee.org/document/11261485} + \item \url{https://ieeexplore.ieee.org/document/11212348} + \item \url{https://www.ijsred.com/volume8/issue2/IJSRED-V8I2P143.pdf} +\end{itemize} + +--- + +\section{Code Repository and Current Status} + +Project repository: +\begin{center} +\url{https://github.com/Abhinav-Kumar012/Rust_Python_ML_PE.git} +\end{center} + +Current progress includes: +\begin{itemize} + \item MNIST training pipeline implemented + \item PyTorch baseline established + \item Initial Rust (Burn) training setup completed +\end{itemize} + +--- + +\section{Track 1: Training-Based Systems Evaluation} + +\subsection{Objective} + +The objective of this track is to answer the following research question: + +\begin{quote} +\textit{Can Rust realistically support end-to-end machine learning training pipelines, and what system-level trade-offs does this introduce compared to PyTorch?} +\end{quote} + +This track explicitly avoids speed-centric benchmarking and instead focuses on system behavior. 
+ +--- + +\subsection{Frameworks Compared} + +\subsubsection{PyTorch (Baseline)} +\begin{itemize} + \item Language: Python + \item Training maturity: Very high + \item Ecosystem: Extensive +\end{itemize} + +\subsubsection{Rust (Burn)} +\begin{itemize} + \item Language: Rust + \item Training maturity: Emerging + \item Design: Idiomatic Rust, native training support +\end{itemize} + +--- + +\subsection{Experimental Controls} + +\textbf{Fixed Across Both Implementations} +\begin{itemize} + \item Dataset splits + \item Number of epochs + \item Batch size + \item Optimizer type + \item Learning rate + \item Hardware +\end{itemize} + +\textbf{Allowed Differences} +\begin{itemize} + \item Internal kernel implementations + \item Graph execution model + \item Memory management +\end{itemize} + +--- + +\subsection{Metrics Collected} + +\begin{itemize} + \item Training time per epoch (reported cautiously) + \item Loss curves and convergence behavior + \item Runtime failures and numerical stability + \item Reproducibility across runs + \item Environment setup and build complexity + \item Dependency footprint and artifact size +\end{itemize} + +--- + +\section{Track 2: Inference-Based DevOps Evaluation} + +\subsection{Objective} + +The objective of this track is to compare \textbf{deployment, security, and operational characteristics} of Python-based and Rust-based ML inference services executing the same ONNX model. + +--- + +\subsection{Inference Services Compared} + +\textbf{Python Service} +\begin{itemize} + \item FastAPI + Uvicorn + \item ONNX Runtime (Python) +\end{itemize} + +\textbf{Rust Service} +\begin{itemize} + \item Axum / Actix + \item burn-rs +\end{itemize} + +Both services expose identical inference endpoints and return identical outputs. 
+ +--- + +\subsection{Evaluation Dimensions} + +\begin{itemize} + \item CI/CD build behavior + \item Container image size and layering + \item Cold-start latency + \item Inference latency and throughput + \item Resource utilization + \item Security and supply-chain surface +\end{itemize} + +--- + +\section{Upcoming Work} + +The following tasks are planned for the next phase of the project: + +\begin{itemize} + \item Develop production-style inference services for both Python and Rust. + \item Write Dockerfiles for Python and Rust inference services. + \item Set up Jenkins-based CI pipelines for inference, including build, test, containerization, and security scanning. +\end{itemize} + + +\end{document} diff --git a/latex_reports/merge_script.py b/latex_reports/merge_script.py new file mode 100644 index 0000000..eddcfac --- /dev/null +++ b/latex_reports/merge_script.py @@ -0,0 +1,132 @@ +import os +import re + +directory = r"c:\Users\Valmik Belgaonkar\OneDrive\Desktop\Rust_Python_ML_PE\latex_reports" + +files = { + 'main': os.path.join(directory, 'main.tex'), + 'mnist': os.path.join(directory, 'mnist.tex'), + 'regression': os.path.join(directory, 'regression.tex'), + 'text': os.path.join(directory, 'text_classification_news.tex'), + 'lstm': os.path.join(directory, 'lstm.tex') +} + +out_file = os.path.join(directory, 'combined_research_paper.tex') + +def extract_body(file_path): + if not os.path.exists(file_path): + print(f"Warning: {file_path} not found.") + return "" + + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Extract everything between \begin{document} and \end{document} + match = re.search(r'\\begin\{document\}(.*?)\\end\{document\}', content, re.DOTALL) + if match: + body = match.group(1) + # Remove \maketitle, \tableofcontents, \newpage as we will have a unified one + body = re.sub(r'\\maketitle', '', body) + body = re.sub(r'\\tableofcontents', '', body) + # Shift sections down since these will be nested under a main Task
section, EXCEPT for main.tex + if 'main.tex' not in file_path: + body = body.replace('\\subsubsection{', '\\paragraph{') + body = body.replace('\\subsection{', '\\subsubsection{') + body = body.replace('\\section{', '\\subsection{') + + # Also catch the asterisk versions like \section*{...} + body = body.replace('\\subsubsection*{', '\\paragraph*{') + body = body.replace('\\subsection*{', '\\subsubsection*{') + body = body.replace('\\section*{', '\\subsection*{') + return body.strip() + return "" + +def process_tables_for_twocolumn(body): + # Change \begin{table}[*] to \begin{table*}[*] to span two columns + body = re.sub(r'\\begin\{table\}\[.*?\]', r'\\begin{table*}[t!]', body) + body = re.sub(r'\\begin\{table\}', r'\\begin{table*}[t!]', body) + body = re.sub(r'\\end\{table\}', r'\\end{table*}', body) + return body + +print("Extracting contents...") +main_body = extract_body(files['main']) +mnist_body = extract_body(files['mnist']) +regression_body = extract_body(files['regression']) +text_body = extract_body(files['text']) +lstm_body = extract_body(files['lstm']) + +# The user wants twocolumn, so we use table* to ensure wide tables don't break. 
+mnist_body = process_tables_for_twocolumn(mnist_body) +regression_body = process_tables_for_twocolumn(regression_body) +text_body = process_tables_for_twocolumn(text_body) +lstm_body = process_tables_for_twocolumn(lstm_body) +main_body = process_tables_for_twocolumn(main_body) + +preamble = r"""\documentclass[10pt,a4paper,twocolumn]{article} + +\usepackage{geometry} +\geometry{margin=0.75in, columnsep=0.25in} + +\usepackage{amsmath,amssymb} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{enumitem} +\usepackage{array} +\usepackage{float} +\usepackage{listings} +\usepackage{xcolor} +\usepackage{pdfpages} +\usepackage{minted} + +\lstdefinelanguage{Dockerfile}{ + keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]", +} + +\title{\textbf{System-Level Evaluation of Rust and Python for Machine Learning}} +\author{Project Elective} +\date{\today} + +\begin{document} + +\maketitle +""" + +postamble = r""" +\end{document} +""" + +print(f"Writing combined file to {out_file}...") +with open(out_file, 'w', encoding='utf-8') as f: + f.write(preamble) + + # Write main.tex content + f.write(main_body) + f.write("\n\n\\clearpage\n\n") + + # Write MNIST content + f.write(r"\section{Task: MNIST Image Classification}" + "\n") + f.write(mnist_body) + f.write("\n\n\\clearpage\n\n") + + # Write Regression content + f.write(r"\section{Task: Regression}" + "\n") + f.write(regression_body) + f.write("\n\n\\clearpage\n\n") + + # Write Text Classification content + f.write(r"\section{Task: Text Classification (AG News)}" + "\n") + f.write(text_body) + f.write("\n\n\\clearpage\n\n") + + # Write LSTM content + f.write(r"\section{Task: LSTM implementation}" + "\n") + f.write(lstm_body) + f.write("\n\n\\clearpage\n\n") + + f.write(postamble) + +print("Combined file created successfully!") diff --git 
a/latex_reports/mnist.tex b/latex_reports/mnist.tex new file mode 100644 index 0000000..79b9052 --- /dev/null +++ b/latex_reports/mnist.tex @@ -0,0 +1,685 @@ +\documentclass[11pt,a4paper]{article} + +\usepackage{geometry} +\geometry{margin=1in} + +\usepackage{amsmath,amssymb} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{enumitem} +\usepackage{array} +\usepackage{float} +\usepackage{listings} +\usepackage{xcolor} +\usepackage{pdfpages} + + +\lstdefinelanguage{Dockerfile}{ + keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]", +} + +\title{\textbf{MNIST}} +\date{\today} + +\begin{document} + +\maketitle + +\section{Architecture Details} + +\begin{table}[h!] +\centering +\renewcommand{\arraystretch}{1.3} +\begin{tabular}{|c|l|l|l|l|} +\hline +\textbf{Step} & \textbf{Layer} & \textbf{Configuration} & \textbf{Input Shape} & \textbf{Output Shape} \\ +\hline + +1 & Input & Grayscale Images & +$[B, H, W]$ & +$[B, H, W]$ \\ + +\hline +2 & Reshape & Add channel dimension & +$[B, H, W]$ & +$[B, 1, H, W]$ \\ + +\hline +3 & Conv2D (conv1) & +$1 \rightarrow 8$, kernel $3 \times 3$ & +$[B, 1, H, W]$ & +$[B, 8, H-2, W-2]$ \\ + +\hline +4 & Dropout & +$p = 0.5$ & +$[B, 8, H-2, W-2]$ & +$[B, 8, H-2, W-2]$ \\ + +\hline +5 & Conv2D (conv2) & +$8 \rightarrow 16$, kernel $3 \times 3$ & +$[B, 8, H-2, W-2]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +6 & Dropout & +$p = 0.5$ & +$[B, 16, H-4, W-4]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +7 & ReLU & +Activation & +$[B, 16, H-4, W-4]$ & +$[B, 16, H-4, W-4]$ \\ + +\hline +8 & Adaptive Avg Pool & +Output size $8 \times 8$ & +$[B, 16, H-4, W-4]$ & +$[B, 16, 8, 8]$ \\ + +\hline +9 & Flatten & +$16 \times 8 \times 8$ & +$[B, 16, 8, 8]$ & +$[B, 1024]$ \\ + +\hline +10 & Linear (fc1) & +$1024 \rightarrow \texttt{hidden\_size}$ & +$[B, 1024]$ & +$[B, 
\texttt{hidden\_size}]$ \\ + +\hline +11 & Dropout & +$p = 0.5$ & +$[B, \texttt{hidden\_size}]$ & +$[B, \texttt{hidden\_size}]$ \\ + +\hline +12 & ReLU & +Activation & +$[B, \texttt{hidden\_size}]$ & +$[B, \texttt{hidden\_size}]$ \\ + +\hline +13 & Linear (fc2) & +$\texttt{hidden\_size} \rightarrow \texttt{num\_classes}$ & +$[B, \texttt{hidden\_size}]$ & +$[B, \texttt{num\_classes}]$ \\ + +\hline +\end{tabular} +\caption{Detailed architecture of the convolutional neural network implemented in Burn. +$B$ denotes batch size, $H$ and $W$ denote input image height and width respectively.} +\label{tab:burn-cnn-architecture} +\end{table} + +\noindent\textbf{Notes:} +\begin{itemize} + \item All convolution layers use default stride = 1 and no padding. + \item Dropout probability is configurable via \texttt{ModelConfig.dropout}. + \item Adaptive average pooling ensures a fixed spatial resolution regardless of input size. + \item The model is fully differentiable and backend-agnostic via Burn's \texttt{Backend} trait. +\end{itemize} + + +\section{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} + +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{RustLocust/MNIST.pdf} + + +\section{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework.
The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency. + +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{PythonLocust/MNIST.pdf} + + +\section{Rust: Dockerfile Design and Containerization Strategy} + +The Dockerfile used for the Rust-based MNIST inference application follows a multi-stage build strategy. Multi-stage builds are commonly used to reduce the size of the final container image by separating the compilation environment from the runtime environment. + +\subsection{Overview of Multi-Stage Build} + +The Dockerfile is divided into two major stages: + +\begin{enumerate} + \item Builder Stage + \item Runtime Stage +\end{enumerate} + +The builder stage is responsible for compiling the Rust application, while the runtime stage contains only the compiled binary and the required runtime dependencies. + +\subsection{Builder Stage} + +The first stage begins with: + +\begin{verbatim} +FROM ubuntu:16.04 AS builder +\end{verbatim} + +This instruction uses Ubuntu 16.04 as the base image for building the Rust application. The alias \texttt{builder} is assigned to this stage so that its outputs can later be referenced in the runtime stage. + +\subsubsection{Working Directory} + +\begin{verbatim} +WORKDIR /app/rust_ml +\end{verbatim} + +The \texttt{WORKDIR} instruction sets the default working directory inside the container to: + +\begin{verbatim} +/app/rust_ml +\end{verbatim} + +All subsequent commands in the builder stage are executed relative to this directory. 
+ +\subsubsection{Installing Build Dependencies} + +The following command installs the required packages for compiling the Rust project: + +\begin{verbatim} +RUN apt-get update && apt-get install -y \ + curl \ + build-essential \ + pkg-config \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* +\end{verbatim} + +Each package serves a specific purpose: + +\begin{itemize} + \item \texttt{curl}: Used to download external files, including the Rust installation script. + \item \texttt{build-essential}: Provides common compilation tools such as \texttt{gcc}, \texttt{g++}, and \texttt{make}. + \item \texttt{pkg-config}: Helps discover system libraries during the build process. + \item \texttt{ca-certificates}: Ensures secure HTTPS communication when downloading dependencies. +\end{itemize} + +The final cleanup command: + +\begin{verbatim} +rm -rf /var/lib/apt/lists/* +\end{verbatim} + +removes cached package lists to reduce image size. + +\subsubsection{Installing Rust} + +Rust is installed using the official Rust installer: + +\begin{verbatim} +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +\end{verbatim} + +This command downloads and executes the \texttt{rustup} installer. + +The flags used have the following meanings: + +\begin{itemize} + \item \texttt{--proto '=https'}: Restricts downloads to HTTPS only. + \item \texttt{--tlsv1.2}: Forces the use of TLS version 1.2 for secure transport. + \item \texttt{-sSf}: Makes \texttt{curl} silent while still showing errors if the download fails. + \item \texttt{-y}: Automatically accepts all installation prompts. +\end{itemize} + +After Rust is installed, the PATH environment variable is updated: + +\begin{verbatim} +ENV PATH="/root/.cargo/bin:${PATH}" +\end{verbatim} + +This ensures that Rust tools such as \texttt{cargo} and \texttt{rustc} are available in subsequent commands. + +\subsubsection{Copying Source Code} + +\begin{verbatim} +COPY . . 
+\end{verbatim} + +This instruction copies the entire project directory from the host system into the current working directory inside the container. + +\subsubsection{Building the Application} + +\begin{verbatim} +RUN cargo build --release -p mnist_infer +\end{verbatim} + +This command compiles the Rust project in release mode. + +The options used are: + +\begin{itemize} + \item \texttt{--release}: Builds the application with compiler optimizations enabled. + \item \texttt{-p mnist\_infer}: Specifies that only the \texttt{mnist\_infer} package should be compiled. +\end{itemize} + +The generated executable is stored in: + +\begin{verbatim} +/app/rust_ml/target/release/mnist_infer +\end{verbatim} + +\subsection{Runtime Stage} + +The second stage begins with: + +\begin{verbatim} +FROM nvidia/vulkan:1.3-470 +\end{verbatim} + +This stage uses an NVIDIA Vulkan runtime image as the base image. The purpose of using this image is to provide Vulkan-related runtime libraries and GPU compatibility for applications that may rely on Vulkan acceleration. + +Compared to the builder image, this runtime image is significantly smaller because it does not contain compilation tools, Rust compilers, or source code. + + +\subsubsection{Runtime Working Directory} + +\begin{verbatim} +WORKDIR /app +\end{verbatim} + +This sets the runtime working directory to: + +\begin{verbatim} +/app +\end{verbatim} + +All runtime files are placed relative to this location. + +\subsubsection{Copying the Compiled Binary} + +\begin{verbatim} +COPY --from=builder /app/rust_ml/target/release/mnist_infer /app/binary +\end{verbatim} + +This instruction copies the compiled executable from the builder stage into the runtime image. + +The \texttt{--from=builder} option tells Docker to retrieve the file from the stage named \texttt{builder}. + +The binary is renamed from: + +\begin{verbatim} +mnist_infer +\end{verbatim} + +to: + +\begin{verbatim} +/app/binary +\end{verbatim} + +inside the runtime container. 
+ +\subsubsection{Copying the Model File} + +\begin{verbatim} +COPY ./model/mnist_rust/model.mpk /app/model/mnist_rust/model.mpk +\end{verbatim} + +This instruction copies the trained model file into the runtime container. + +The model file is stored at: + +\begin{verbatim} +/app/model/mnist_rust/model.mpk +\end{verbatim} + +The application can later load this file during inference. + +\subsubsection{Environment Variables} + +Two environment variables are defined: + +\begin{verbatim} +ENV RUST_LOG=info +ENV MODEL_PATH=/app/model/mnist_rust/model.mpk +\end{verbatim} + +Their purposes are: + +\begin{itemize} + \item \texttt{RUST\_LOG=info}: Enables logging at the info level. + \item \texttt{MODEL\_PATH}: Stores the path to the trained model file. +\end{itemize} + +Using environment variables makes the application more flexible because configuration values can be changed without modifying the source code. + +\subsubsection{Exposing the Application Port} + +\begin{verbatim} +EXPOSE 9050 +\end{verbatim} + +This instruction documents that the containerized application listens on port 9050. + +Although \texttt{EXPOSE} does not automatically publish the port to the host system, it informs users and orchestration tools such as Docker Compose or Kubernetes which port should be mapped. + +\subsubsection{Container Startup Command} + +\begin{verbatim} +CMD ["./binary"] +\end{verbatim} + +This instruction defines the default command executed when the container starts. + +The compiled Rust binary is launched directly from the runtime working directory. + +\subsection{Advantages of the Dockerfile Design} + +This Dockerfile provides several important advantages: + +\begin{itemize} + \item Reduced final image size through multi-stage builds. + \item Separation of build dependencies and runtime dependencies. + \item Improved security because the runtime image does not contain compilers or source code. + \item Faster deployment due to a lightweight runtime container. 
+ \item Better portability because the same container can run consistently across different environments. + \item Easier maintenance through the use of environment variables and explicit working directories. +\end{itemize} + +Overall, this Dockerfile is designed to efficiently package the Rust-based MNIST inference application for deployment while minimizing runtime overhead and maintaining reproducibility. + +\section{Python (PyTorch) Dockerfile} + +This section details the image optimization strategy implemented for the MNIST inference container. The core approach minimizes the Docker image size by decoupling the heavy machine learning dependencies (PyTorch, etc.) from the application container. Instead of baking these libraries into the image, they are stored on an external volume (NFS share) and mounted at runtime. + +\subsection{Dockerfile Analysis} + +The \texttt{Dockerfile} is kept intentionally lightweight. By excluding large dependencies like \texttt{torch} from the \texttt{pip install} command, the image size remains very small (only containing the base Python runtime and lightweight web frameworks). + +\begin{lstlisting}[language=Dockerfile, caption={Optimized Inference Dockerfile}, label={lst:dockerfile_inference}] +FROM python:3.12-slim + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV OMP_NUM_THREADS=1 +ENV MKL_NUM_THREADS=1 + +# Critical: Point Python to the external volume +ENV PYTHONPATH=/external-libs/ml_env/lib/python3.12/site-packages + +WORKDIR /app + +# Only install lightweight app dependencies +RUN pip install --no-cache-dir --upgrade pip && \ + pip install fastapi==0.110.0 uvicorn==0.29.0 python-multipart==0.0.9 + +COPY app.py model.py model.pt ./ + +EXPOSE 8000 + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] +\end{lstlisting} + +\begin{itemize} + \item \textbf{Base Image:} Uses \texttt{python:3.12-slim} to minimise the OS footprint. 
+ \item \textbf{Environment Configuration:} + \begin{itemize} + \item \texttt{PYTHONDONTWRITEBYTECODE=1}: Prevents Python from writing \texttt{.pyc} files to disk. + \item \textbf{\texttt{PYTHONPATH}}: Crucially set to \texttt{/external-libs/ml\_env/lib/python3.12/site-packages}. This instructs the Python interpreter to look for libraries in the mounted volume directory, not just the default system paths. + \end{itemize} + \item \textbf{Minimal Dependencies:} The \texttt{pip install} command only installs \texttt{fastapi}, \texttt{uvicorn}, and \texttt{python-multipart}. Heavy ML libraries are assumed to be present in the mounted volume. +\end{itemize} + +\subsection{Volume Mounting Strategy} + +The strategy relies on two shell scripts to set up the environment on the host machine and run the container with the correct volume mappings. + +\subsubsection{Library Setup (\texttt{mount\_libs.sh})} +This script runs on the host machine (or a VM node) to prepare the shared library volume. +\begin{enumerate} + \item \textbf{NFS Client Installation:} It installs \texttt{nfs-common} to enable Network File System capabilities. + \item \textbf{Mounting:} It connects to a remote NFS server (\texttt{172.16.203.14}) where the pre-installed ML libraries reside. + \item \textbf{Local Path:} The remote libraries are mounted to \texttt{/mnt/ml-libs} on the host. This directory acts as the bridge between the NFS server and the Docker container. +\end{enumerate} + +\subsubsection{Runtime Execution (\texttt{run\_container.sh})} +This script launches the Docker container with the necessary runtime configurations to access the external libraries. 
+ +\begin{lstlisting}[language=Bash, caption={Container Execution Command}] +docker run -d \ + -v /mnt/ml-libs:/external-libs \ + -e PYTHONPATH=/external-libs/ml_env/lib/python3.12/site-packages \ + -p 8000:8000 \ + fastapi-ml-app +\end{lstlisting} + +\begin{itemize} + \item \textbf{\texttt{-v /mnt/ml-libs:/external-libs}}: This bind mount maps the host's \texttt{/mnt/ml-libs} (which contains the NFS data) to \texttt{/external-libs} inside the container. + \item \textbf{\texttt{-e PYTHONPATH=...}}: explicit environment variable override ensures the container's Python runtime finds the packages in \texttt{/external-libs}. +\end{itemize} + +\subsection{Benefits and Optimization} + +\begin{table}[h!] +\centering +\caption{Optimization Benefits} +\label{tab:docker_optimization} +\begin{tabular}{|l|p{6cm}|p{6cm}|} +\hline +\textbf{Feature} & \textbf{Standard Approach} & \textbf{Volume Mount Approach} \\ \hline +\textbf{Image Size} & \textbf{Huge} ($>2GB$). Includes PyTorch, CUDA binaries, and all dependencies. & \textbf{Tiny} (~100MB). Only contains app code and minimal HTTP libs. \\ \hline +\textbf{Build Time} & \textbf{Slow}. Downloading and installing PyTorch takes minutes. & \textbf{Fast}. setup only installs \texttt{fastapi}. \\ \hline +\textbf{Updates} & requires rebuilding and pushing large layers for every code change. & Code changes only require rebuilding the tiny app layer. Library updates are handled externally. \\ \hline +\end{tabular} +\end{table} + +This architecture allows for rapid deployment and updating of the application logic without the overhead of moving gigabytes of container layers for unchanged machine learning dependencies. + +\section{Rust: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model used for the experiment consisted of two convolutional layers followed by adaptive average pooling, dropout, and two fully connected layers. 
The complete architecture is shown below: + +\begin{verbatim} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 80} + conv2: Conv2d {ch_in: 8, ch_out: 16, stride: [1, 1], kernel_size: [3, 3], dilation: [1, 1], groups: 1, padding: Valid, params: 1168} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, bias: true, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, bias: true, params: 5130} + activation: Relu + params: 531178 +} +\end{verbatim} + +The model was trained for 10 epochs. Over the course of training, both the training and validation performance improved consistently. Training accuracy increased from 81.575\% in the first epoch to 97.300\% in the final epoch, while validation accuracy improved from 92.133\% to 98.517\%. + +Similarly, the training loss decreased significantly from 0.656 to 0.087, and the validation loss reduced from 0.258 to 0.054 by the end of training. The macro F1-score also improved substantially, reaching 96.974\% for training and 98.321\% for validation. 
+ +\begin{table}[h] +\centering +\caption{Training and Validation Metrics Summary} +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Accuracy & 81.575 & 1 & 97.300 & 10 \\ +Train & Loss & 0.087 & 10 & 0.656 & 1 \\ +Train & Precision@Top1 [Macro] & 82.126 & 1 & 97.304 & 10 \\ +Train & Recall@Top1 [Macro] & 81.286 & 1 & 97.232 & 10 \\ +Train & F1-Score@Top1 [Macro] & 79.715 & 1 & 96.974 & 10 \\ +Train & Top-5 Accuracy & 97.696 & 1 & 99.969 & 10 \\ +Train & CPU Memory (GB) & 2.514 & 2 & 2.927 & 10 \\ +Train & CPU Usage (\%) & 20.753 & 5 & 30.394 & 10 \\ +\hline +Valid & Accuracy & 92.133 & 1 & 98.517 & 10 \\ +Valid & Loss & 0.054 & 10 & 0.258 & 1 \\ +Valid & Precision@Top1 [Macro] & 92.154 & 1 & 98.527 & 10 \\ +Valid & Recall@Top1 [Macro] & 91.978 & 1 & 98.425 & 10 \\ +Valid & F1-Score@Top1 [Macro] & 91.176 & 1 & 98.321 & 10 \\ +Valid & Top-5 Accuracy & 99.583 & 1 & 99.967 & 10 \\ +Valid & CPU Memory (GB) & 2.514 & 2 & 3.085 & 10 \\ +Valid & CPU Usage (\%) & 20.539 & 2 & 39.652 & 10 \\ +\hline +\end{tabular} +\label{tab:cnn_metrics_summary} +\end{table} + +The results indicate that the model achieved strong generalization performance with minimal overfitting, as the validation accuracy remained slightly higher than the training accuracy throughout the experiment. The consistently high Top-5 accuracy values further demonstrate that the model was able to correctly identify the correct class within its top predictions. + +It should also be noted that the execution terminated with a segmentation fault after training completion. However, since the fault occurred after all epochs had been completed and metrics had already been recorded, it did not affect the validity of the training results. \\ + +The time taken to train the model is 229.916s (3min 49.916s). 
+ + + +\section{Python: CNN Model Architecture and Training Performance} + +The convolutional neural network (CNN) model implemented in Python mirrors the Rust architecture, consisting of two convolutional layers followed by adaptive average pooling, dropout regularization, and two fully connected layers. + +\textbf{Model Architecture:} + +\begin{verbatim} +Model { + conv1: Conv2d {ch_in: 1, ch_out: 8, kernel_size: [3, 3], stride: [1, 1]} + conv2: Conv2d {ch_in: 8, ch_out: 16, kernel_size: [3, 3], stride: [1, 1]} + pool: AdaptiveAvgPool2d {output_size: [8, 8]} + dropout: Dropout {prob: 0.5} + linear1: Linear {d_input: 1024, d_output: 512, params: 524800} + linear2: Linear {d_input: 512, d_output: 10, params: 5130} + activation: ReLU + total params: 531178 +} +\end{verbatim} + +The model was trained for 10 epochs. Training and validation performance improved consistently over time. + +Training accuracy increased from 82.01\% to 97.29\%, while validation accuracy improved from 92.87\% to 98.20\%. +Training loss decreased significantly from 0.5947 to 0.0867, and validation loss reduced from 0.2475 to 0.0579. + +The macro F1-score reached 0.9814, demonstrating strong classification performance. Additionally, the Top-5 accuracy achieved 99.98\%, indicating highly reliable predictions. 
+ +\begin{table}[h] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Accuracy & 82.01 & 1 & 97.29 & 10 \\ +Train & Loss & 0.0867 & 10 & 0.5947 & 1 \\ +Train & Precision@Top1 [Macro] & -- & -- & -- & -- \\ +Train & Recall@Top1 [Macro] & -- & -- & -- & -- \\ +Train & F1-Score@Top1 [Macro] & -- & -- & -- & -- \\ +Train & Top-5 Accuracy & -- & -- & -- & -- \\ +Train & CPU Memory (GB) & 0.84 & -- & 0.98 & -- \\ +Train & CPU Usage (\%) & 72.1 & -- & 92.0 & -- \\ +\hline + +Valid & Accuracy & 92.87 & 1 & 98.20 & 10 \\ +Valid & Loss & 0.0579 & 10 & 0.2475 & 1 \\ +Valid & Precision@Top1 [Macro] & 0.9816 & -- & 0.9816 & -- \\ +Valid & Recall@Top1 [Macro] & 0.9812 & -- & 0.9812 & -- \\ +Valid & F1-Score@Top1 [Macro] & 0.9814 & -- & 0.9814 & -- \\ +Valid & Top-5 Accuracy & 99.98 & -- & 99.98 & -- \\ +Valid & CPU Memory (GB) & 0.84 & -- & 0.98 & -- \\ +Valid & CPU Usage (\%) & 72.1 & -- & 92.0 & -- \\ +\hline + +\end{tabular} +\caption{Python Training and Validation Metrics Summary} +\end{table} + +The results indicate strong generalization performance with no signs of overfitting. Validation accuracy remained consistently high and closely followed training accuracy. + +Training was stable with zero NaN events observed. The total training time was 182.23 seconds, with an average epoch time of 15.05 seconds and an average iteration speed of 50.58 iterations per second. 
+ + + +\newpage + +\section{Container Comparison: Python vs Rust} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 1.02GB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + +\section{Model Size Comparison} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ + +\end{document} diff --git a/latex_reports/regression.tex b/latex_reports/regression.tex new file mode 100644 index 0000000..2b18f1b --- /dev/null +++ b/latex_reports/regression.tex @@ -0,0 +1,333 @@ +\documentclass[12pt]{article} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{graphicx} +\usepackage{hyperref} +\usepackage{geometry} +\geometry{a4paper, margin=1in} +\usepackage{pdfpages} + + +\title{\textbf{Comparative Analysis of Regression Implementations: Rust (Burn) vs. PyTorch}} +\author{Technical Report} +\date{\today} + +\begin{document} + +\maketitle +\tableofcontents +\newpage + +\section{Introduction} +This document outlines the architectural, mathematical, and deployment specifics of implementing a Neural Network-based Regression model across two disparate machine learning environments: Rust (utilizing the Burn framework) and Python (utilizing PyTorch). It covers the distinct model architecture decisions, dataset handling strategies, and specialized pipeline deployment techniques leveraging Network File Systems (NFS) mapping via Docker bounds. + +\section{Model Architecture and Mathematical Formulation} + +\subsection{Mathematical Foundation} +The core mathematical foundation deployed across both frameworks is a classical Feed-Forward Neural Network consisting of a single hidden dimension mapping inputs directly onto a continuous single-variable regression output. + +For a given input feature vector $X \in \mathbb{R}^N$ (where $N$ dictates the feature size depending on the target dataset), the network's forward transformation can be represented sequentially as: +\begin{align} + Z_1 &= X \cdot W_1^T + b_1 \quad &\text{(Input Projection)} \\ + A_1 &= \max(0, Z_1) \quad &\text{(ReLU Activation)} \\ + \hat{Y} &= A_1 \cdot W_2^T + b_2 \quad &\text{(Output Projection)} +\end{align} + +Where: +\begin{itemize} + \item $W_1 \in \mathbb{R}^{H \times N}$ and $b_1 \in \mathbb{R}^H$ map the inputs onto the hidden vector space $H$. 
+ \item $\max(0, \cdot)$ denotes the Non-Linear Rectified Linear Unit (ReLU) mapping algorithm. + \item $W_2 \in \mathbb{R}^{1 \times H}$ and $b_2 \in \mathbb{R}$ collapse the hidden abstraction onto the finalized regression scalar prediction $\hat{Y}$. +\end{itemize} + +\subsection{Architectural Configurations} +While the mathematical foundations are identical, implementations slightly differ based on dataset selections within the modules: +\begin{itemize} + \item \textbf{PyTorch Architecture:} Configures $N=13$ input features mapping to $H=64$ hidden parameters. + \item \textbf{Rust (Burn) Architecture:} Configures $N=8$ input features concurrently mapping to $H=64$ hidden parameters. +\end{itemize} +In both configurations, standard parameter biases (`bias=True`) are included and automatically initialized. + +\section{Training Pipelines} + +Both codebases train the model iteratively tracking gradients via the Adam optimizer scaled against Mean Squared Error (MSE) loss logic: +\[ \text{MSE} = \frac{1}{B} \sum_{i=1}^{B} (Y_i - \hat{Y}_i)^2 \] + +\subsection{PyTorch Context} +\begin{itemize} + \item \textbf{Data Loading:} Automatically pulls the \textbf{Boston Housing} dataset array (.npz file) from an external Google API via `urllib` and manually partitions it down into an explicit $80/20$ split. + \item \textbf{Telemetry Metrics:} Generates explicit hardware tracking loops inside the main epoch runner. Uses the `psutil` library to compute and stream epoch `iteration\_speed`, raw RAM consumption, and `cpu\_temp` hardware sensors parallel to the loss parameters. +\end{itemize} + +\subsection{Rust (Burn) Context} +\begin{itemize} + \item \textbf{Data Loading:} Links into Huggingface's dataset registry asynchronously targeting the \textbf{California Housing} SQLized splits mapping onto memory arrays via localized `HousingDistrictItem` structs. 
+ \item \textbf{Normalization Mapping:} Computes spatial min-max normalizations programmatically over inputs during training: + \[ X_{norm} = \frac{X - \text{min}}{\text{max} - \text{min}} \] + This logic restricts features within standard boundaries precluding exploding gradient derivations. +\end{itemize} + +\section{Inference Pipeline and Docker NFS Integration} + +Deploying these isolated pipelines necessitates radically different execution strategies, highlighting Python's heavyweight runtime dependency bottlenecks versus Rust's compile-time optimizations. + +\subsection{PyTorch Inference Architecture} +Standard PyTorch Docker environments routinely eclipse several gigabytes due to CUDA bindings and generic scientific computation loops. To circumvent this inside microservices, the PyTorch inference pipeline mandates a hybrid Network File System (NFS) mapping architecture: +\begin{enumerate} + \item \textbf{NFS Mounting (\texttt{mount\_libs.sh}):} Installs an external `nfs-common` client locally and binds the extensive python library volume from an external dedicated storage server (`172.16.203.14`) into the host machine's `/mnt/LSTM-libs` map. + \item \textbf{Lightweight Container Image:} The backend \texttt{Dockerfile} avoids `pip install` commands completely, simply initializing a barebone `nvidia/cuda:12.1.1` image mapping Python $3.11$ system links. + \item \textbf{Volume Inject (\texttt{run\_container.sh}):} The script initializes the container enforcing `-v` flags that sync the NFS `/mnt/LSTM-libs` directory seamlessly onto the Docker's `/external-libs`. Crucially, it overrides the system \texttt{PYTHONPATH} to target those external `site-packages` at runtime. + \item \textbf{Execution:} The `FastAPI` instance loads, bypasses massive disk pulls, links the models iteratively, and fields inbound `HousingFeatures` lists continuously. 
+\end{enumerate} + +\subsection{Rust (Burn) Inference Architecture} +Rust handles Docker microservices inherently via statically linked deployments: +\begin{itemize} + \item \textbf{Multi-Stage Compiling:} Executes a build phase operating within an oversized `rust:1.92-alpine` chain, ejecting the resulting binary onto an isolated stripped `alpine:3.23` environment structure. + \item \textbf{Native Routing:} Utilizes \texttt{Axum} servers to establish the HTTP logic endpoints securely routing JSON payloads mapping to specific feature names (e.g. \texttt{median\_income}, \texttt{house\_age}). +\end{itemize} + +\section{Rust: Regression Model Performance Analysis} + +The regression model used in this experiment was a simple feed-forward neural network consisting of one hidden layer followed by an output layer. The model was designed to predict the median house value based on eight input features. + +\subsection{Model Architecture} + +The architecture of the regression model is shown below: + +\begin{verbatim} +RegressionModel { + input_layer: Linear {d_input: 8, d_output: 64, bias: true, params: 576} + output_layer: Linear {d_input: 64, d_output: 1, bias: true, params: 65} + activation: Relu + params: 641 +} +\end{verbatim} + +The model contains: + +\begin{itemize} + \item An input layer that maps 8 input features to 64 hidden units + \item A ReLU activation function applied after the hidden layer + \item An output layer that maps the 64 hidden units to a single scalar value +\end{itemize} + +The total number of trainable parameters in the model was only 641, making it a lightweight model suitable for fast training and inference. + +\subsection{Training Configuration} + +The model was trained for 100 epochs. A constant learning rate of: + +\[ +1.0 \times 10^{-3} +\] + +was used throughout the entire training process. + +\subsection{Training Performance} + +The training loss decreased substantially over the 100 epochs. 
Initially, the model started with a training loss of 3.086 during the first epoch. By the final epoch, the loss had reduced to 0.414. + +This significant reduction in loss indicates that the model successfully learned the underlying relationship between the input features and the target variable. + +\subsection{Validation Performance} + +Validation loss also showed a considerable improvement during training. The validation loss decreased from 4.132 in the first epoch to a minimum of 0.635 at epoch 51. + +The difference between the final training loss and the minimum validation loss suggests that the model achieved good generalization performance without severe overfitting. + +\begin{table}[h] +\centering +\caption{Training and Validation Metrics Summary for the Regression Model} +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Loss & 0.414 & 100 & 3.086 & 1 \\ +Train & Learning Rate & $1.0 \times 10^{-3}$ & 1 & $1.0 \times 10^{-3}$ & 100 \\ +Train & CPU Memory (GB) & 2.125 & 4 & 2.325 & 56 \\ +Train & CPU Usage (\%) & 19.539 & 54 & 37.989 & 11 \\ +\hline +Valid & Loss & 0.635 & 51 & 4.132 & 1 \\ +Valid & CPU Memory (GB) & 2.124 & 3 & 2.325 & 55 \\ +Valid & CPU Usage (\%) & 19.550 & 54 & 37.960 & 11 \\ +\hline +\end{tabular} +\label{tab:regression_metrics_summary} +\end{table} + +\subsection{Prediction Example} + +A sample prediction generated by the model is shown below: + +\begin{verbatim} +Predicted 2.021734 Expected 2.158 +\end{verbatim} + +The predicted value is reasonably close to the expected value, indicating that the model was able to approximate the target variable with acceptable accuracy. 
+ +Since the median house value was measured in units of 100,000 dollars, the prediction corresponds to: + +\begin{itemize} + \item Predicted value: approximately 202,173 dollars + \item Expected value: approximately 215,800 dollars +\end{itemize} + +\subsection{Predicted vs. Expected Distribution} + +The predicted-versus-expected plot suggests that the model captures the general trend in the target values, although some prediction errors remain for certain samples. + +Most of the predicted values appear concentrated around the central region of the distribution, indicating that the model performs better on common house value ranges than on extreme values. + +\subsection{Resource Utilization} + +The model required relatively little memory during execution. Training memory usage ranged from 2.125 GB to 2.325 GB, while validation memory usage ranged from 2.124 GB to 2.325 GB. + +CPU utilization remained moderate throughout training. Training CPU usage ranged from 19.539\% to 37.989\%, while validation CPU usage ranged from 19.550\% to 37.960\%. + +CPU temperature values were unavailable and therefore recorded as NaN. + +\subsection{Execution Time and Failure} + +The complete training and evaluation process required: + +\begin{itemize} + \item Real time: 3 minutes and 18.257 seconds + \item User CPU time: 4 minutes and 13.554 seconds + \item System CPU time: 50.340 seconds +\end{itemize} + +\section{Language Specific Implementation Details} + +\subsection{PyTorch-Specific Paradigms} +\begin{itemize} + \item \textbf{Thread Clamping:} Due to inference optimization restrictions (especially running CPU variations alongside container structures), the `app.py` enforces explicit core binding calls via `torch.set\_num\_threads(1)` and `torch.set\_num\_interop\_threads(1)` securing computational resources and restricting OS context-switching overheads. 
+ \item \textbf{Matrix Array Verifications:} Manually inspects raw matrix vector mappings validating dimensions dynamically against numeric constraints: \texttt{len(x) != NUM\_FEATURES} triggering runtime panics before pipeline evaluations fail. + \item \textbf{Manual Hardware Moving:} The framework is heavily littered with required `.to(device)` mapping configurations switching inputs, datasets, targets, and models manually between the host and external components. +\end{itemize} + +\subsection{Rust (Burn)-Specific Paradigms} +\begin{itemize} + \item \textbf{Generic Compile-Time Shapes:} Dimension mappings and tensor validations are fundamentally enforced inside the Rust compiler boundaries via `` arrays indicating batches of distinct input structures mapping to `targets: Tensor`. Invalid sizes fail compilation, voiding the requirement for manual PyTorch matrix validations. + \item \textbf{Struct Batching Protocols:} Inference doesn't evaluate primitive float arrays. Intead, the API relies on executing an overarching `HousingBatcher` which transforms specific struct domains (\texttt{HousingDistrictItem}) safely into tensor primitives while executing implicit `self.normalizer.to\_device(device)` logic silently against constants behind boundaries. + \item \textbf{Record Deserialization:} States are strictly detached from models via standard `.mpk` maps. They invoke explicit \texttt{NoStdTrainingRecorder::new().load()} tracking traits unbinding memory limits inherent to standard dict serialization configurations natively loaded via `RegressionModelConfig`. +\end{itemize} + +\newpage +\section{Python: Regression Model Architecture and Training Performance} + +The regression model used in this experiment is a lightweight fully connected neural network with a small number of parameters (961 total). The model is optimized using mean squared error loss. + + + +The model was trained for 100 epochs. 
Training loss decreased significantly from 8265.55 to 69.86 (99.15\% reduction), while validation loss decreased from 9045.34 to 55.70. + +Despite strong loss reduction, the model struggled to achieve good generalization. The validation $R^2$ score remained negative (-1.07), indicating that the model performs worse than a simple baseline predictor. + +\begin{table}[h] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Loss & 68.50 & 100 & 8265.55 & 1 \\ +Train & RMSE & 8.39 & 100 & 91.53 & 1 \\ +Train & MAE & 6.29 & 100 & 90.33 & 1 \\ +Train & R$^2$ & -96.93 & 1 & 0.1767 & 100 \\ +Train & Grad Norm (Total) & 118.80 & -- & 112876.03 & 1 \\ +Train & Iteration Speed (it/s) & 0.69 & 1 & 14.39 & 54 \\ +Train & CPU Memory (GB) & 0.87 & -- & 1.02 & -- \\ +Train & CPU Usage (\%) & 45.7 & -- & 97.6 & -- \\ +\hline + +Valid & Loss & 50.93 & 65 & 9045.34 & 1 \\ +Valid & RMSE & 7.14 & 65 & 95.11 & 1 \\ +Valid & MAE & 6.00 & 65 & 93.52 & 1 \\ +Valid & R$^2$ & -335.40 & 1 & -0.8940 & 65 \\ +Valid & Iteration Speed (it/s) & 0.69 & 1 & 14.39 & 54 \\ +Valid & CPU Memory (GB) & 0.87 & -- & 1.02 & -- \\ +Valid & CPU Usage (\%) & 45.7 & -- & 97.6 & -- \\ +\hline + +\end{tabular} +\caption{Regression Model Training and Validation Metrics Summary} +\end{table} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 47.39 seconds + \item Average Epoch Time: 0.225 seconds + \item Iteration Speed (Mean): 10.36 it/s + \item Gradient Norm (Mean): 7981.31 + \item NaN Events: 0 + \item Convergence: Non-monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate that while optimization was successful in reducing loss, the model lacks sufficient capacity or feature representation to generalize well. 
The persistently negative validation $R^2$ suggests underfitting or a mismatch between model complexity and data characteristics. + + + + +\newpage + +\section{Container Comparison: Python vs Rust} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 3.98GB & 973MB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\section{Model Size Comparison} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. 
The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. + + + + +\end{document} diff --git a/latex_reports/text_classification_news.tex b/latex_reports/text_classification_news.tex new file mode 100644 index 0000000..dafcd2c --- /dev/null +++ b/latex_reports/text_classification_news.tex @@ -0,0 +1,652 @@ +\documentclass[11pt,a4paper]{article} + +\usepackage{geometry} +\geometry{margin=1in} + +\usepackage{amsmath,amssymb} +\usepackage{graphicx} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{enumitem} +\usepackage{array} +\usepackage{float} +\usepackage{listings} +\usepackage{pdfpages} + + +\lstdefinelanguage{Dockerfile}{ + keywords={FROM, RUN, CMD, LABEL, MAINTAINER, EXPOSE, ENV, ADD, COPY, ENTRYPOINT, VOLUME, USER, WORKDIR, ARG, ONBUILD, STOPSIGNAL, HEALTHCHECK, SHELL}, + sensitive=true, + comment=[l]{\#}, + morestring=[b]", +} + +\title{\textbf{Text Classification}} +\date{\today} + +\begin{document} + +\maketitle + +\section{Model Architecture and Training Strategy} + +The text classification system is built using the \texttt{Burn} framework in Rust, leveraging a Transformer-based architecture for feature extraction and a linear classification head. This section details the mathematical formulation of the model and the strategy employed for training. + +\subsection{Model Architecture} +The core of the model is a Transformer Encoder, which processes a sequence of token embeddings to capture contextual relationships. The architecture consists of three primary stages: embedding, encoding, and classification. + +\subsubsection{Embedding Layer} +Input text is tokenized and converted into a sequence of indices $X \in \mathbb{N}^{B \times L}$, where $B$ is the batch size and $L$ is the sequence length. The model utilizes two parallel embedding layers: +\begin{enumerate} + \item \textbf{Token Embedding ($E_{tok}$)}: Maps token indices to dense vectors of dimension $d_{model}$. 
+ \item \textbf{Positional Embedding ($E_{pos}$)}: Maps position indices $[0, \dots, L-1]$ to dense vectors of dimension $d_{model}$ to inject sequence order information. +\end{enumerate} + +The final embedding representation $E$ is obtained by averaging the token and positional embeddings: +\begin{equation} + E = \frac{E_{tok}(X) + E_{pos}(\text{positions})}{2} +\end{equation} + +\subsubsection{Transformer Encoder} +The embedding tensor $E$ is passed through a multi-layer Transformer Encoder. Each layer consists of a Multi-Head Self-Attention (MHSA) mechanism followed by a Position-wise Feed-Forward Network (FFN), with residual connections and layer normalization. + +The configuration used in this implementation is as follows: +\begin{itemize} + \item \textbf{Model Dimension ($d_{model}$)}: 256 + \item \textbf{Feed-Forward Dimension ($d_{ff}$)}: 1024 + \item \textbf{Number of Heads ($N_{heads}$)}: 8 + \item \textbf{Number of Layers ($N_{layers}$)}: 4 + \item \textbf{Normalization}: Layer norm applied before sub-layers (Pre-Norm). +\end{itemize} + +Let $H = \text{TransformerEncoder}(E)$, where $H \in \mathbb{R}^{B \times L \times d_{model}}$ represents the contextualized representations of the input sequence. + +\subsubsection{Classification Head} +For classification, the model utilizes the representation of the first token (typically acting as the [CLS] token) from the encoded sequence. This vector is passed through a linear layer to project it into the class space: +\begin{equation} + Y = \text{Linear}(H_{[:, 0, :]}) +\end{equation} +where $Y \in \mathbb{R}^{B \times N_{classes}}$ represents the logits. 
For inference, a Softmax function is applied to obtain probabilities: +\begin{equation} + \hat{P} = \text{Softmax}(Y) +\end{equation} + +\begin{table}[h] +\centering +\caption{Model Architecture Summary} +\label{tab:model_arch} +\begin{tabular}{|l|l|c|c|} +\hline +\textbf{Component} & \textbf{Configuration / Details} & \textbf{Input Shape} & \textbf{Output Shape} \\ \hline +Token Embedding & $V \to d_{model}$ ($V$: Vocab Size) & $(B, L)$ & $(B, L, 256)$ \\ \hline +Pos Embedding & $L_{max} \to d_{model}$ & $(B, L)$ & $(B, L, 256)$ \\ \hline +Embedding Merge & Average: $(E_{tok} + E_{pos})/2$ & - & $(B, L, 256)$ \\ \hline +Transformer Block & 4 Layers, 8 Heads, $d_{ff}=1024$ & $(B, L, 256)$ & $(B, L, 256)$ \\ \hline +Feature Extract & Slice First Token (Index 0) & $(B, L, 256)$ & $(B, 256)$ \\ \hline +Classifier Head & Linear ($256 \to N_{classes}$) & $(B, 256)$ & $(B, N_{classes})$ \\ \hline +\end{tabular} +\end{table} + + + +\section{Rust Backend: Load Testing with Locust} + +The performance of the Rust-based backend was evaluated using the Locust load testing framework. The objective was to analyze system behavior under concurrent user load and measure key performance characteristics such as throughput and latency. + +\textbf{Testing Setup:} + +\begin{itemize} + \item Tool: Locust + \item Backend: Rust (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{RustLocust/text.pdf} + + +\section{Python Backend: Load Testing with Locust} + +The performance of the Python-based backend was evaluated using the Locust load testing framework. The goal was to assess system behavior under concurrent user load and analyze key performance characteristics such as throughput and response latency.
+ +\textbf{Testing Setup:} +\begin{itemize} + \item Tool: Locust + \item Backend: Python (HTTP service) + \item Test Type: Concurrent user load simulation + \item Environment: Linux system +\end{itemize} + +\textbf{Dashboard Visualization:} + +\textbf{Full Report:} + +The complete load testing dashboard has been exported as a PDF and is included below for detailed inspection. + +\includepdf[pages=-]{PythonLocust/text.pdf} + + +\subsection{Training Strategy} +The model is trained using a supervised learning approach with the following configuration: + +\subsubsection{Loss Function} +The training objective is to minimize the Cross-Entropy Loss between the predicted logits $Y$ and the ground truth class labels $C$: +\begin{equation} + \mathcal{L} = \text{CrossEntropy}(Y, C) = -\sum_{c=1}^{N_{classes}} \mathbb{1}_{c=C} \log\left(\frac{e^{Y_c}}{\sum_{j} e^{Y_j}}\right) +\end{equation} + +\subsubsection{Optimization} +We employ the \textbf{Adam} optimizer with the following parameters: +\begin{itemize} + \item \textbf{Weight Decay}: $5 \times 10^{-5}$ + \item \textbf{Beta Coefficients}: Standard defaults (typically $\beta_1=0.9, \beta_2=0.999$) +\end{itemize} + +\subsubsection{Learning Rate Scheduling} +A \textbf{Noam Learning Rate Scheduler} is used to stabilize training. The learning rate increases linearly during a warmup phase and then decays proportionally to the inverse square root of the step number. +\begin{equation} + LR = d_{model}^{-0.5} \cdot \min(step\_num^{-0.5}, step\_num \cdot warmup\_steps^{-1.5}) +\end{equation} +\begin{itemize} + \item \textbf{Warmup Steps}: 1000 + \item \textbf{Base Learning Rate}: 0.01 +\end{itemize} + +\subsubsection{Metrics} +During training and validation, the following metrics are tracked to monitor performance: +\begin{itemize} + \item \textbf{Loss}: Cross-Entropy Loss. + \item \textbf{Accuracy}: Percentage of correct predictions. + \item \textbf{F1-Score, Precision, Recall}: Macro-averaged metrics to account for class balance.
+\end{itemize} + +\section{Burn Code Specifications} + +This section outlines the significant implementation details of the text classification system, focusing on the architectural choices in \texttt{model.rs} and the robust training pipeline defined in \texttt{training.rs}. +\subsection{Model Implementation (\texttt{model.rs})} +The \texttt{TextClassificationModel} leverages the \textbf{Burn} framework's modular design to implement a Transformer-based classifier. Key features of this implementation include: +\begin{itemize} + \item \textbf{Dual Embedding Strategy:} The model employs two distinct embedding layers: \texttt{embedding\_token} for semantic content and \texttt{embedding\_pos} for positional information. A unique characteristic of this implementation is the fusion strategy, where these embeddings are combined via averaging: + \[ + E_{final} = \frac{E_{pos} + E_{token}}{2} + \] + This differs from the standard summation approach often found in BERT implementations, potentially stabilizing the initial magnitude of the embedding vectors. + + \item \textbf{Configurable Architecture:} The system uses a \texttt{TextClassificationModelConfig} struct derived with the \texttt{Config} macro. This allows for type-safe and serializable hyperparameter management, ensuring the model architecture (hidden size, vocabulary size, sequence length) can be easily saved, loaded, and reproducible. + + \item \textbf{Masked Attention:} The forward pass actively utilizes padding masks (\texttt{mask\_pad}). These masks are passed into the \texttt{TransformerEncoderInput}, ensuring that the self-attention mechanism strictly ignores padding tokens, which is critical for handling variable-length text sequences correctly. + + \item \textbf{Separation of Train and Inference Logic:} The model explicitly implements the \texttt{TrainStep} and \texttt{InferenceStep} traits. 
+ \begin{itemize} + \item \textbf{Training:} Returns a \texttt{ClassificationOutput} struct containing the calculated Cross-Entropy loss for backpropagation. + \item \textbf{Inference:} Returns raw probabilities by applying a softmax activation on the output logits, facilitating direct class prediction. + \end{itemize} +\end{itemize} +\subsection{Training Pipeline (\texttt{training.rs})} +The training module is designed for reliability and comprehensive observability. It integrates advanced optimization techniques and hardware-aware monitoring. +\begin{itemize} + \item \textbf{Noam Scheduler:} Transformer models are notoriously sensitive to learning rates. The code implements the \textbf{Noam Learning Rate Scheduler} (popularized by ``Attention Is All You Need''), which features a linear warmup phase (1000 steps) followed by an inverse square root decay based on the model dimension ($d_{model}$). This prevents gradient explosions during early training stages. + + \item \textbf{Distributed Training Support:} The implementation explicitly handles distributed computing scenarios. It utilizes Rust's feature flags (\texttt{\#[cfg(feature = "ddp")]}) to switch between single-device training and \textbf{Distributed Data Parallel (DDP)} strategies. When enabled, it employs a tree-based \texttt{AllReduceStrategy} for synchronizing gradients across multiple GPUs or nodes. + + \item \textbf{Comprehensive Telemetry:} The training loop is instrumented with an extensive suite of metrics beyond simple accuracy. It tracks: + \begin{itemize} + \item \textbf{Classification Metrics:} Macro-averaged F1-Score, Precision, and Recall, providing a holistic view of model performance on imbalanced datasets. + \item \textbf{Hardware Diagnostics:} CPU temperature, memory usage, and utilization are logged alongside training progress, aiding in the detection of thermal throttling or memory leaks during long training runs.
+ \end{itemize} + + \item \textbf{Efficient Data Sampling:} To manage large datasets efficiently, the loader utilizes a \texttt{SamplerDataset}. This limits the effective epoch size to 50,000 training samples and 5,000 validation samples, allowing for rapid iteration and feedback loops without needing to process the entire corpus in every epoch. +\end{itemize} + +\subsection{Conditional Compilation} +I think we should document a bit about this. + +\section{Rust Docker image} + +\section{Rust Inference Code} + +\section{Rust: Transformer-Based Text Classification Model Performance} + +The text classification model used in this experiment was based on a Transformer encoder architecture. The model consisted of token embeddings, positional embeddings, a multi-layer Transformer encoder, and a final linear classification layer. + +\subsection{Model Architecture} + +The architecture of the model is shown below: + +\begin{verbatim} +TextClassificationModel { + transformer: TransformerEncoder { + d_model: 256, + d_ff: 1024, + n_heads: 8, + n_layers: 4, + dropout: 0.1, + norm_first: true, + quiet_softmax: true, + params: 3159040 + } + embedding_token: Embedding { + n_embedding: 28996, + d_model: 256, + params: 7422976 + } + embedding_pos: Embedding { + n_embedding: 256, + d_model: 256, + params: 65536 + } + output: Linear { + d_input: 256, + d_output: 4, + bias: true, + params: 1028 + } + n_classes: 4 + params: 10648580 +} +\end{verbatim} + +The Transformer encoder used four encoder layers with eight attention heads per layer. Each layer had a model dimension of 256 and a feed-forward dimension of 1024. A dropout rate of 0.1 was used to reduce overfitting. + +The token embedding layer mapped a vocabulary of 28,996 tokens into 256-dimensional vectors. Positional embeddings of length 256 were also used so that the Transformer could capture token order information. + +The final output layer mapped the Transformer representation into four output classes. 
+ +The total number of trainable parameters in the model was 10,648,580. + +\subsection{Training Configuration} + +The model was trained for a total of 5 epochs. During training, the learning rate decayed from $1.107 \times 10^{-5}$ in the first epoch to $3.733 \times 10^{-6}$ in the final epoch. + +\subsection{Training Performance} + +Training accuracy improved steadily from 57.968\% in the first epoch to 81.474\% in the fifth epoch. Similarly, the training loss decreased from 0.981 to 0.507. + +The macro precision, recall, and F1-score also improved significantly during training: + +\begin{itemize} + \item Precision increased from 58.893\% to 81.606\% + \item Recall increased from 57.741\% to 81.334\% + \item F1-score increased from 50.201\% to 76.001\% +\end{itemize} + +These results indicate that the model learned meaningful semantic patterns in the text data over successive epochs. + +\subsection{Validation Performance} + +Validation performance also improved consistently. Validation accuracy increased from 72.280\% in the first epoch to a maximum of 81.640\% in the fourth epoch. + +The validation loss decreased from 0.731 to 0.507, showing that the model generalized reasonably well to unseen data. + +The validation precision, recall, and F1-score also showed strong improvement: + +\begin{itemize} + \item Precision increased from 72.230\% to 81.739\% + \item Recall increased from 72.258\% to 82.039\% + \item F1-score increased from 65.796\% to 76.509\% +\end{itemize} + +The relatively close values between training and validation accuracy suggest that the model did not suffer from severe overfitting. + +\begin{table}[h] +\centering +\caption{Training and Validation Metrics Summary for the Transformer-Based Text Classification Model} +% @kumar please layout sahi kar sakta hai? 
love you +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min.} & \textbf{Epoch} & \textbf{Max.} & \textbf{Epoch} \\ +\hline +Train & Accuracy & 57.968 & 1 & 81.474 & 5 \\ +Train & Loss & 0.507 & 5 & 0.981 & 1 \\ +Train & Precision@Top1 [Macro] & 58.893 & 1 & 81.606 & 5 \\ +Train & Recall@Top1 [Macro] & 57.741 & 1 & 81.334 & 5 \\ +Train & F1-Score@Top1 [Macro] & 50.201 & 1 & 76.001 & 5 \\ +Train & Learning Rate & $3.733 \times 10^{-6}$ & 5 & $1.107 \times 10^{-5}$ & 1 \\ +Train & CPU Memory (GB) & 2.401 & 2 & 2.736 & 5 \\ +Train & CPU Usage (\%) & 16.529 & 1 & 17.160 & 5 \\ +\hline +Valid & Accuracy & 72.280 & 1 & 81.640 & 4 \\ +Valid & Loss & 0.507 & 5 & 0.731 & 1 \\ +Valid & Precision@Top1 [Macro] & 72.230 & 1 & 81.739 & 5 \\ +Valid & Recall@Top1 [Macro] & 72.258 & 1 & 82.039 & 5 \\ +Valid & F1-Score@Top1 [Macro] & 65.796 & 1 & 76.509 & 5 \\ +Valid & CPU Memory (GB) & 2.263 & 1 & 2.747 & 5 \\ +Valid & CPU Usage (\%) & 20.331 & 1 & 22.190 & 3 \\ +\hline +\end{tabular} +\label{tab:transformer_text_classification_metrics} +\end{table} + +\subsection{Resource Utilization} + +The CPU memory usage remained relatively stable throughout training. Training memory usage ranged from 2.401 GB to 2.736 GB, while validation memory usage ranged from 2.263 GB to 2.747 GB. + +CPU utilization also remained moderate, with training CPU usage ranging from 16.529\% to 17.160\% and validation CPU usage ranging from 20.331\% to 22.190\%. + +CPU temperature values were unavailable during the experiment and therefore recorded as NaN. + +\subsection{Execution Time and Failure} + +The complete training run required: + +\begin{itemize} + \item Real time: 32 minutes and 37.872 seconds + \item User CPU time: 36 minutes and 52.313 seconds + \item System CPU time: 1 minute and 41.258 seconds +\end{itemize} + +\section{PyTorch Training Pipeline} +This section details the Python implementation of the text classification training pipeline. 
The code mimics the architecture and logic of the Rust version to ensure comparable performance and behavior. +\subsection{Code Highlights} +\begin{itemize} + \item \textbf{Custom Transformer Model:} + The \texttt{TextClassificationModel} is a custom \texttt{nn.Module} containing: + \begin{itemize} + \item Dual embedding layers (\texttt{embedding\_token} and \texttt{embedding\_pos}). + \item A unique fusion strategy averaging the two embeddings: $E = (E_{pos} + E_{tok}) / 2$. + \item A standard \texttt{TransformerEncoder} stack. + \item A classification head that projects the encoded features to the 4 output classes of the AG News dataset. + \end{itemize} + \item \textbf{Noam Learning Rate Scheduler:} + A custom \texttt{NoamLR} scheduler is implemented to replicate the specific warmup and decay behavior used in the Rust implementation (and the original ``Attention Is All You Need'' paper). + \[ + lr = \text{factor} \cdot (d_{model}^{-0.5}) \cdot \min(step^{-0.5}, step \cdot warmup^{-1.5}) + \] + This ensures stable training dynamics for the Transformer architecture. + \item \textbf{Dataset Handling:} + The code utilizes the Hugging Face \texttt{datasets} library to load the ``ag\_news'' dataset. It explicitly shuffles and subsets the data (50,000 train, 5,000 test) to match the constraints applied in the Rust implementation, ensuring a fair apples-to-apples comparison between the two languages. + \item \textbf{Collate Function with Padding Masks:} + A custom \texttt{collate\_fn} handles dynamic batching. It tokenizes text using the \texttt{bert-base-cased} tokenizer and generates a boolean padding mask.
Note the inversion logic: PyTorch's \texttt{TransformerEncoder} expects \texttt{True} for padded positions (unlike some other implementations where 1 implies validity), requiring careful mask generation: + \begin{verbatim} + mask_pad = (encoding['attention_mask'] == 0) + \end{verbatim} + \item \textbf{Training Loop:} + The training loop is a standard PyTorch implementation using \texttt{tqdm} for progress tracking. It uses \texttt{CrossEntropyLoss} as the criterion and the \texttt{Adam} optimizer. Crucially, the scheduler step is called after every batch (not every epoch), consistent with the Noam schedule requirements. +\end{itemize} + + +\section{Python: Text Classification (News) Transformer Model} + +The model is a Transformer-based architecture designed for multi-class news classification. It consists of a multi-layer encoder with multi-head self-attention and feedforward networks. + +\textbf{Model Architecture:} + +\begin{verbatim} +Model { + transformer_encoder: { + d_model: 256, + nhead: 8, + num_layers: 4, + dim_feedforward: 1024 + } + max_seq_len: 256 + num_classes: 4 + total params: 10649092 +} +\end{verbatim} + +The model was trained for 5 epochs and showed steady convergence across all evaluation metrics. + +Training accuracy improved from 56.19\% to 79.70\%, while validation accuracy increased from 68.14\% to 79.00\%. +Training loss decreased from 1.0145 to 0.5483, and validation loss reduced from 0.8137 to 0.5628. + +The model achieved a macro F1-score of 0.7903 on the validation/test set, indicating reasonably strong classification performance for a Transformer trained over a small number of epochs. 
+ +\begin{table}[h] +\centering +\begin{tabular}{|l|l|c|c|c|c|} +\hline +\textbf{Split} & \textbf{Metric} & \textbf{Min} & \textbf{Epoch} & \textbf{Max} & \textbf{Epoch} \\ +\hline + +Train & Accuracy & 56.19 & 1 & 79.70 & 5 \\ +Train & Loss & 0.5483 & 5 & 1.0145 & 1 \\ +Train & Grad Norm (Total) & 10.39 & 4 & 22.95 & 3 \\ +Train & Iteration Speed (it/s) & 44.33 & 2 & 45.05 & 1 \\ +Train & CPU Memory (GB) & 1.12 & -- & 1.12 & -- \\ +Train & CPU Usage (\%) & 22.2 & -- & 23.5 & -- \\ +\hline + +Valid & Accuracy & 68.14 & 1 & 79.00 & 5 \\ +Valid & Loss & 0.5628 & 5 & 0.8137 & 1 \\ +Valid & Precision@Top1 [Macro] & 0.7187 & 1 & 0.7942 & 5 \\ +Valid & Recall@Top1 [Macro] & 0.6815 & 1 & 0.7899 & 5 \\ +Valid & F1-Score@Top1 [Macro] & 0.6773 & 1 & 0.7903 & 5 \\ +Valid & Iteration Speed (it/s) & 44.33 & 2 & 45.05 & 1 \\ +Valid & CPU Memory (GB) & 1.12 & -- & 1.12 & -- \\ +Valid & CPU Usage (\%) & 22.2 & -- & 23.5 & -- \\ +\hline + +\end{tabular} +\caption{Transformer Text Classification Training and Validation Metrics} +\end{table} + +\textbf{Training Efficiency and Stability:} +\begin{itemize} + \item Total Training Time: 706.99 seconds + \item Average Epoch Time: 139.83 seconds + \item Iteration Speed (Mean): 44.70 it/s + \item Gradient Norm (Mean): 15.75 + \item GPU Memory Usage: 178.79 MB + \item NaN Events: 0 + \item Convergence: Monotonic loss decrease + \item Overfitting Detected: No +\end{itemize} + +The results indicate stable training and consistent improvement across epochs. While performance is lower than simpler CNN-based tasks, this is expected due to the increased complexity of natural language understanding tasks. + +\section{PyTorch Inference Pipeline Docker Image: Hybrid NFS and Docker Inference Architecture} + +This section details the hybrid deployment strategy designed to optimize Docker image size and leverage a centralized machine learning environment. 
The architecture splits the responsibilities between a \textbf{Library VM} (storage-heavy) and a \textbf{Docker VM} (compute-centric). + +\subsection{Architecture Overview} + +The system comprises two primary components: +\begin{enumerate} + \item \textbf{Library VM (NFS Server)}: Hosts the heavy Python environment, including PyTorch, Transformers, and CUDA libraries. This environment is exported via NFS. + \item \textbf{Docker VM (Inference Client)}: Runs a lightweight Docker container that mounts the external libraries at runtime. +\end{enumerate} + +\subsection{Implementation Details} + +\subsubsection{1. Library Sharing via NFS} +The Library VM exports the directory containing the Python site-packages. On the Docker VM, this directory is mounted using the \texttt{mount\_libs.sh} script. + +\begin{lstlisting}[language=bash, caption={Mounting the NFS Library Volume}] +# Configuration from mount_libs.sh +NFS_SERVER_IP="172.16.203.14" +NFS_EXPORT_PATH="/home/iiitb/Documents/textClassificationVolume" +LOCAL_MOUNT_POINT="/mnt/text-libs" + +# Mounting the remote volume +sudo mount -t nfs "$NFS_SERVER_IP:$NFS_EXPORT_PATH" "$LOCAL_MOUNT_POINT" +\end{lstlisting} + +\subsubsection{2. Lightweight Docker Image} +The Docker image is built using \texttt{Dockerfile.cpu} and excludes heavy ML libraries. It only contains the application code, the model weights, and minimal system dependencies. + +\begin{lstlisting}[language=Dockerfile, caption={Dockerfile.cpu Configuration}] +FROM python:3.12-slim + +# Point Python to the external NFS mount +ENV PYTHONPATH=/external-libs/text_env/lib/python3.12/site-packages + +# Copy only the app and model +COPY app.py ./ +COPY model_pytorch_text_classification/ag_news_model.pth ./model/ + +# No 'pip install torch' is performed here! +\end{lstlisting} + +\subsubsection{3. Runtime Execution} +The container is launched via \texttt{run\_inference.sh}, which mounts the NFS volume into the container at \texttt{/external-libs}. 
+ +\begin{lstlisting}[language=bash, caption={Launching the Inference Container with the NFS Libraries Mounted}] +docker run --gpus all \ + -v /mnt/text-libs:/external-libs \ + -v text_model_vol:/models \ + -e PYTHONPATH=/external-libs/text_env/lib/python3.12/site-packages \ + -p 8000:8000 \ + text_classification_image +\end{lstlisting} + +\subsection{Impact on Image Size} + +This architecture drastically reduces the storage footprint of the inference artifact. By decoupling the static libraries from the application logic, we achieve the following reduction: + +% \begin{table}[h] +% \centering +% \begin{tabular}{|l|c|c|} +% \hline +% \textbf{Component} & \textbf{Traditional Approach} & \textbf{Hybrid NFS Approach} \\ \hline +% Base Image (Python Slim) & $\sim$150 MB & $\sim$150 MB \\ \hline +% PyTorch & $\sim$3.5 GB & \textbf{0 MB (Mounted)} \\ \hline +% Transformers & $\sim$500 MB & \textbf{0 MB (Mounted)} \\ \hline +% Application Code & $<1$ MB & $<1$ MB \\ \hline +% Model Weights & $\sim$100 MB & $\sim$100 MB \\ \hline +% \textbf{Total Image Size} & \textbf{8.93 GB} & \textbf{$\sim$250 MB} \\ \hline +% \end{tabular} +% \caption{Comparison of Docker Image Sizes} +% \end{table} + +% This \textbf{99.03\% reduction} in image size results in: +% \begin{itemize} +% \item Faster deployment and rollback times. +% \item Significantly lower network bandwidth usage. +% \item Efficient storage utilization on the Docker VM. +% \end{itemize} + +\section{Hybrid Inference Architecture with NFS and Docker} + +This section outlines the architectural design of our hybrid machine learning deployment strategy, detailing the distinct roles of the Library VM and the Docker VM, and how they interact to optimize resource usage. + +\subsection{Library Virtual Machine (NFS Server)} + +The \textbf{Library VM} serves as the centralized repository for the heavy components of the machine learning environment.
Its primary function is to host large, static dependencies such as the Python runtime environment, deep learning frameworks (e.g., PyTorch, TensorFlow), and specialized libraries (e.g., Transformers, CUDA routines). + +By consolidating these resource-intensive libraries on a single machine, we avoid the redundancy of installing them on every inference node. This machine acts as a Network File System (NFS) server, exporting its directory structure to be accessed by other machines in the network. + +\subsubsection{What is an NFS Server?} + +A \textbf{Network File System (NFS)} server is a computer that allows other machines (clients) to access its files over a network as if they were stored locally. In our architecture, the NFS server ``shares'' the directory containing the Python libraries. The client machines can then read these files directly, eliminating the need to physically copy the heavy libraries to each client. + +\subsection{Docker Virtual Machine (Inference Node)} + +The \textbf{Docker VM} is the compute-centric node responsible for executing the inference workload. It hosts the Docker engine and runs the lightweight containerized application. + +This machine does not permanently store the heavy ML libraries. Instead, it mounts the shared directory from the Library VM at runtime. Reliable network connectivity to the Library VM ensures that the Docker container has immediate access to the necessary software dependencies. + +\subsection{Hybrid Deployment Strategy} + +The hybrid strategy combines the isolation and portability of Docker with the efficiency of centralized storage. + +\begin{enumerate} + \item \textbf{Decoupling Environment and Application}: We separate the rapidly changing application code (API logic, business rules) from the slowly changing environment (Python packages). The application code resides inside the Docker image, while the environment resides on the NFS share.
+ \item \textbf{Runtime Linking}: When the Docker container starts, it mounts the NFS share. The container's environment variables are configured to add this mounted path to its Python path. This allows the Python interpreter inside the container to import modules (like \texttt{torch} or \texttt{transformers}) from the network share as if they were installed locally. + \item \textbf{Drastic Image Reduction}: Since the Docker image only contains the application code and minimal system dependencies, its size is reduced from several gigabytes to a few hundred megabytes. This facilitates rapid deployments, faster scaling, and reduced storage costs. +\end{enumerate} + +This architecture essentially transforms the Docker container into a lightweight "shell" that borrows its heavy "engine" from the Library VM only when needed. + +\subsection{Identifying the Virtual Machine Roles} + +The architecture explicitly designates two separate machines for distinct purposes. Based on the configuration scripts, their roles are defined as follows: + +\subsubsection{1. The Library VM (Environment Host)} +This machine acts as the \textbf{storage backend} for the machine learning environment. +\begin{itemize} + \item \textbf{Role}: It hosts the actual Python environment (Torch, Transformers, etc.) on its local filesystem and exports it via NFS. + \item \textbf{Identifier}: In our configuration (see \texttt{mount\_libs.sh}), this machine is identified by the IP address \texttt{172.16.203.14}. + \item \textbf{Key Path}: The environment resides at \texttt{/home/iiitb/Documents/textClassificationVolume}. + \item \textbf{Action}: It does \textit{not} run the Docker container. Ideally, it simply stays online to serve files to other machines. +\end{itemize} + +\subsubsection{2. The Docker VM (Inference Runner)} +This machine acts as the \textbf{compute frontend} that serves the API. +\begin{itemize} + \item \textbf{Role}: It builds and runs the lightweight Docker container. 
It does not have the deep learning libraries installed on its own disk; it borrows them from the Library VM. + \item \textbf{Identifier}: This is the machine where you execute the \texttt{mount\_libs.sh} and \texttt{run\_inference.sh} scripts. + \item \textbf{Key Path}: It mounts the remote library to the local path \texttt{/mnt/text-libs}. + \item \textbf{Action}: It executes the \texttt{docker run} command, effectively "bringing the code to the data" (or in this case, bringing the library data to the code container). +\end{itemize} + +\begin{table}[h] +\centering +\begin{tabular}{|l|l|l|} +\hline +\textbf{Feature} & \textbf{Library VM} & \textbf{Docker VM} \\ \hline +\textbf{Primary Function} & Storage \& NFS Server & Model Inference \& API Hosting \\ \hline +\textbf{IP Address} & \texttt{172.16.203.14} & (Assigned by Network) \\ \hline +\textbf{Python Libs} & Stored Physically on Disk & Mounted via Network (NFS) \\ \hline +\textbf{Docker Image} & Not required & Builds \& Runs Lightweight Image \\ \hline +\end{tabular} +\caption{Distinction between Library VM and Docker VM} +\end{table} + + + + +\newpage + +\section{Container Comparison: Python vs Rust} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Feature} & \textbf{Python (GPU)} & \textbf{Rust (WGPU)} \\ +\hline +Image Name & text\_classification\_image & text\_class\_rs \\ +\hline +Size & 4.02 GB & $\sim$1 GB \\ +\hline +Backend & PyTorch + CUDA & Native Rust (WGPU) \\ +\hline +Startup Time & Slower & Faster \\ +\hline +Dependencies & Heavy (Torch, CUDA, Python) & Minimal (compiled binary) \\ +\hline +Deployment Complexity & Higher & Lower \\ +\hline +Flexibility & High (research-friendly) & Moderate \\ +\hline +Runtime Stability & Medium & High \\ +\hline +\end{tabular} +\caption{Comparison of Python GPU-based and Rust-based inference containers} +\end{table} + +\noindent +The Python-based container provides flexibility and rapid experimentation using the PyTorch ecosystem, but at 
the cost of larger image size and dependency complexity. In contrast, the Rust-based container offers a lightweight, production-ready solution with faster startup time and minimal runtime dependencies, making it more suitable for deployment scenarios. + + +\section{Model Size Comparison} + +\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|} +\hline +\textbf{Model} & \textbf{Rust (.mpk/.bin)} & \textbf{Python (.pt/.pth)} \\ +\hline +MNIST & 1.1 MB & 2 MB \\ +\hline +Text Classification (AG News) & 21 MB & 40.6 MB \\ +\hline +Regression & 4 KB & 6 MB \\ +\hline +LSTM & 60 KB & 120 KB \\ +\hline +\end{tabular} +\caption{Comparison of model sizes between Rust and Python implementations} +\end{table} + +\noindent +Rust-based serialized models are consistently smaller than their Python counterparts. This reduction is most significant in simpler models such as regression, and remains substantial for larger models like text classification. The smaller footprint of Rust models makes them more suitable for lightweight deployment and resource-constrained environments. 
+ + +\end{document} diff --git a/python_ml/benchmark/bench_lstm.py b/python_ml/benchmark/bench_lstm.py new file mode 100644 index 0000000..1ec8631 --- /dev/null +++ b/python_ml/benchmark/bench_lstm.py @@ -0,0 +1,15 @@ +from locust import HttpUser, task, between +import numpy as np + +IP = "127.0.0.1" +PORT = "9050" +SEQ_LEN = 4 + +class LoadTestprofile(HttpUser): + wait_time = between(0.1,0.2) + host = f"http://{IP}:{PORT}" + rng = rng = np.random.default_rng() + @task + def load_task(self): + random_list = self.rng.uniform(0.0, 10.0, SEQ_LEN).tolist() + _ = self.client.post("/predict",json=random_list) \ No newline at end of file diff --git a/python_ml/benchmark/bench_regression.py b/python_ml/benchmark/bench_regression.py new file mode 100644 index 0000000..8f0e019 --- /dev/null +++ b/python_ml/benchmark/bench_regression.py @@ -0,0 +1,44 @@ +from locust import HttpUser, task, between +import pandas as pd +import numpy as np + +IP = "127.0.0.1" +PORT = "9050" +PATH_TO_DSET = "test_data/regression/cali_housing.parquet" + +''' +# download outside the script + +import pandas as pd + +df = pd.read_parquet( + "hf://datasets/gvlassis/california_housing/data/test-00000-of-00001.parquet" +) +df.to_parquet("cali_housing.parquet") +''' + +class LoadTestprofile(HttpUser): + wait_time = between(0.1,0.2) + host = f"http://{IP}:{PORT}" + + @task + def load_task(self): + random_row = self.df.iloc[np.random.randint(0,len(self.df))] + text_to_send = { + 'MedInc' : random_row['MedInc'], + 'HouseAge' : random_row['HouseAge'], + 'AveRooms' : random_row['AveRooms'], + 'AveBedrms' : random_row['AveBedrms'], + 'Population' : random_row['Population'], + 'AveOccup' : random_row['AveOccup'], + 'Latitude' : random_row['Latitude'], + 'Longitude' : random_row['Longitude'], + 'MedHouseVal' : random_row['MedHouseVal'] + } + _ = self.client.post("/predict",json=text_to_send) + # is_correct = (response.json()['prediction'] == self.dict_class[random_row['label']]) + + + + def on_start(self): + 
self.df = pd.read_parquet(PATH_TO_DSET) \ No newline at end of file diff --git a/python_ml/benchmark/bench_text_class.py b/python_ml/benchmark/bench_text_class.py index 4dc1347..3423e7c 100644 --- a/python_ml/benchmark/bench_text_class.py +++ b/python_ml/benchmark/bench_text_class.py @@ -27,22 +27,22 @@ def load_task(self): text_to_send = { 'text' : random_row['text'] } - response = self.client.post("/predict",json=text_to_send) - is_correct = (response.json()['prediction'] == self.dict_class[random_row['label']]) - events.request.fire( - request_type="ML", - name="accuracy", - response_time=0, - response_length=1, - exception=None if is_correct else Exception("wrong"), - ) + _ = self.client.post("/predict",json=text_to_send) + # is_correct = (response.json()['prediction'] == self.dict_class[random_row['label']]) + # events.request.fire( + # request_type="ML", + # name="accuracy", + # response_time=0, + # response_length=1, + # exception=None if is_correct else Exception("wrong"), + # ) def on_start(self): self.df = pd.read_parquet(PATH_TO_DSET) - self.dict_class = { - 0 : "World", - 1 : "Sports", - 2 : "Business", - 3 : "Technology" - } \ No newline at end of file + # self.dict_class = { + # 0 : "World", + # 1 : "Sports", + # 2 : "Business", + # 3 : "Technology" + # } \ No newline at end of file diff --git a/python_ml/pytorch/regression/Inference/app.py b/python_ml/pytorch/regression/Inference/app.py index 5e4ea49..366e1a1 100644 --- a/python_ml/pytorch/regression/Inference/app.py +++ b/python_ml/pytorch/regression/Inference/app.py @@ -11,7 +11,10 @@ # Constants (must match training) # ========================================================== -NUM_FEATURES = 13 +NUM_FEATURES = 8 + +FEATURES_MIN = torch.tensor([0.4999, 1., 0.8461, 0.375, 3., 0.6923, 32.54, -124.35], dtype=torch.float32) +FEATURES_MAX = torch.tensor([15., 52., 141.9091, 34.0667, 35682., 1243.3333, 41.95, -114.31], dtype=torch.float32) GENERATED_DIR = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), "generated") @@ -81,12 +84,13 @@ class HousingFeatures(BaseModel): def preprocess(features): - x = np.array(features, dtype=np.float32) + x = torch.tensor(features, dtype=torch.float32) if len(x) != NUM_FEATURES: raise ValueError(f"Expected {NUM_FEATURES} features") - x = torch.tensor(x).unsqueeze(0) + x = x.unsqueeze(0) + x = (x - FEATURES_MIN) / (FEATURES_MAX - FEATURES_MIN) return x diff --git a/python_ml/pytorch/regression/Training/training.py b/python_ml/pytorch/regression/Training/training.py index 096b368..78e5b2a 100644 --- a/python_ml/pytorch/regression/Training/training.py +++ b/python_ml/pytorch/regression/Training/training.py @@ -3,10 +3,10 @@ import shutil import time import random -import urllib.request import psutil import torch import numpy as np +from datasets import load_dataset from dataclasses import dataclass from torch import nn from torch.utils.data import Dataset, DataLoader @@ -15,49 +15,12 @@ # Constants # ========================================================== -NUM_FEATURES = 13 +NUM_FEATURES = 8 OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "generated") -DATASET_URL = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/boston_housing.npz" -RAW_DATA_FILE = os.path.join(OUTPUT_DIR, "boston_housing.npz") - -TRAIN_FILE = os.path.join(OUTPUT_DIR, "train_data.npz") -VALID_FILE = os.path.join(OUTPUT_DIR, "valid_data.npz") - - -# ========================================================== -# Dataset preparation -# ========================================================== - -def prepare_dataset(): - - os.makedirs(OUTPUT_DIR, exist_ok=True) - - if not os.path.exists(RAW_DATA_FILE): - print("Downloading Boston Housing dataset...") - urllib.request.urlretrieve(DATASET_URL, RAW_DATA_FILE) - print("Download complete.") - - if not os.path.exists(TRAIN_FILE) or not os.path.exists(VALID_FILE): - - data = np.load(RAW_DATA_FILE) - - X = data["x"] - y = data["y"] - 
- split = int(0.8 * len(X)) - - X_train = X[:split] - y_train = y[:split] - - X_valid = X[split:] - y_valid = y[split:] - - np.savez(TRAIN_FILE, x=X_train, y=y_train) - np.savez(VALID_FILE, x=X_valid, y=y_valid) - - print("Dataset prepared.") +FEATURES_MIN = torch.tensor([0.4999, 1., 0.8461, 0.375, 3., 0.6923, 32.54, -124.35], dtype=torch.float32) +FEATURES_MAX = torch.tensor([15., 52., 141.9091, 34.0667, 35682., 1243.3333, 41.95, -114.31], dtype=torch.float32) # ========================================================== @@ -66,9 +29,28 @@ def prepare_dataset(): class HousingDataset(Dataset): - def __init__(self, inputs, targets): - self.inputs = torch.tensor(inputs, dtype=torch.float32) + def __init__(self, split="train"): + hf_dataset = load_dataset("gvlassis/california_housing", split=split) + + # Extract features in the correct order: + # MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude + features = [ + hf_dataset["MedInc"], + hf_dataset["HouseAge"], + hf_dataset["AveRooms"], + hf_dataset["AveBedrms"], + hf_dataset["Population"], + hf_dataset["AveOccup"], + hf_dataset["Latitude"], + hf_dataset["Longitude"] + ] + + targets = hf_dataset["MedHouseVal"] + + inputs = torch.tensor(features, dtype=torch.float32).T self.targets = torch.tensor(targets, dtype=torch.float32) + + self.inputs = (inputs - FEATURES_MIN) / (FEATURES_MAX - FEATURES_MIN) def __len__(self): return len(self.inputs) @@ -78,21 +60,11 @@ def __getitem__(self, idx): @staticmethod def train(): - - prepare_dataset() - - data = np.load(TRAIN_FILE) - - return HousingDataset(data["x"], data["y"]) + return HousingDataset("train") @staticmethod def validation(): - - prepare_dataset() - - data = np.load(VALID_FILE) - - return HousingDataset(data["x"], data["y"]) + return HousingDataset("validation") # ========================================================== diff --git a/reports/rust/Screenshots/LSTM_TRPS.png b/reports/rust/Screenshots/LSTM_TRPS.png new file mode 100644 index 
0000000..01c054e Binary files /dev/null and b/reports/rust/Screenshots/LSTM_TRPS.png differ diff --git a/reports/rust/Screenshots/LSTM_response_times.png b/reports/rust/Screenshots/LSTM_response_times.png new file mode 100644 index 0000000..2cba5a2 Binary files /dev/null and b/reports/rust/Screenshots/LSTM_response_times.png differ diff --git a/reports/rust/Screenshots/MNIST_TRPS.png b/reports/rust/Screenshots/MNIST_TRPS.png new file mode 100644 index 0000000..1fdbc62 Binary files /dev/null and b/reports/rust/Screenshots/MNIST_TRPS.png differ diff --git a/reports/rust/Screenshots/MNIST_request_time.png b/reports/rust/Screenshots/MNIST_request_time.png new file mode 100644 index 0000000..6465c81 Binary files /dev/null and b/reports/rust/Screenshots/MNIST_request_time.png differ diff --git a/reports/rust/Screenshots/regression_TRPS.png b/reports/rust/Screenshots/regression_TRPS.png new file mode 100644 index 0000000..6341b8a Binary files /dev/null and b/reports/rust/Screenshots/regression_TRPS.png differ diff --git a/reports/rust/Screenshots/regression_respose_time.png b/reports/rust/Screenshots/regression_respose_time.png new file mode 100644 index 0000000..1bcd9b6 Binary files /dev/null and b/reports/rust/Screenshots/regression_respose_time.png differ diff --git a/reports/rust/Screenshots/text_class_respose_time.png b/reports/rust/Screenshots/text_class_respose_time.png new file mode 100644 index 0000000..88f0e45 Binary files /dev/null and b/reports/rust/Screenshots/text_class_respose_time.png differ diff --git a/reports/rust/Screenshots/text_class_trps.png b/reports/rust/Screenshots/text_class_trps.png new file mode 100644 index 0000000..aae0669 Binary files /dev/null and b/reports/rust/Screenshots/text_class_trps.png differ diff --git a/reports/rust/lstm.html b/reports/rust/lstm.html new file mode 100644 index 0000000..43ae5e7 --- /dev/null +++ b/reports/rust/lstm.html @@ -0,0 +1,124 @@ + + + + + + + + + + + Locust + + + + +
+ + + + + \ No newline at end of file diff --git a/reports/rust/mnist.html b/reports/rust/mnist.html new file mode 100644 index 0000000..6f73a2f --- /dev/null +++ b/reports/rust/mnist.html @@ -0,0 +1,124 @@ + + + + + + + + + + + Locust + + + + +
+ + + + + \ No newline at end of file diff --git a/reports/rust/regression.html b/reports/rust/regression.html new file mode 100644 index 0000000..0c554d7 --- /dev/null +++ b/reports/rust/regression.html @@ -0,0 +1,124 @@ + + + + + + + + + + + Locust + + + + +
+ + + + + \ No newline at end of file diff --git a/reports/rust/text_class.html b/reports/rust/text_class.html new file mode 100644 index 0000000..e9417ab --- /dev/null +++ b/reports/rust/text_class.html @@ -0,0 +1,124 @@ + + + + + + + + + + + Locust + + + + +
+ + + + + \ No newline at end of file diff --git a/rust_ml/Cargo.toml b/rust_ml/Cargo.toml index a24c637..ea509eb 100644 --- a/rust_ml/Cargo.toml +++ b/rust_ml/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "3" -members = ["lstm_train","mnist_infer","mnist_ml", "regression", "text_classification_infer", "text_classification_news", "text_gen_train"] +members = ["lstm_inference","lstm_train","mnist_infer","mnist_ml", "regression", "regression_inference", "text_classification_infer", "text_classification_news", "text_gen_train"] [workspace.lints.clippy] all = "warn" diff --git a/rust_ml/Dockerfile.lstm_inf b/rust_ml/Dockerfile.lstm_inf new file mode 100644 index 0000000..da26e46 --- /dev/null +++ b/rust_ml/Dockerfile.lstm_inf @@ -0,0 +1,44 @@ +# ----------------------- +# Build Stage +# ----------------------- +FROM ubuntu:16.04 AS builder + +WORKDIR /app/rust_ml + +RUN apt-get update && apt-get install -y \ + curl \ + build-essential \ + pkg-config \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +COPY . . 
+ +# Build the release binary for lstm_inference +RUN cargo build --release -p lstm_inference + +# ----------------------- +# Runtime Stage +# ----------------------- +FROM nvidia/vulkan:1.3-470 + +WORKDIR /app + +# Copy compiled binary from builder +COPY --from=builder /app/rust_ml/target/release/lstm_inference /app/binary + +COPY ./model/lstm_train/config.json /app/model/lstm_train/config.json +COPY ./model/lstm_train/model.mpk /app/model/lstm_train/model.mpk + +# Environment variables +ENV RUST_LOG=info +# Setup the default mounted model path +ENV MODEL_DIR=/app/model/lstm_train + +EXPOSE 9050 + +CMD ["./binary"] diff --git a/rust_ml/Dockerfile.regression_inf b/rust_ml/Dockerfile.regression_inf new file mode 100644 index 0000000..2d7c027 --- /dev/null +++ b/rust_ml/Dockerfile.regression_inf @@ -0,0 +1,44 @@ +# ----------------------- +# Build Stage +# ----------------------- +FROM ubuntu:16.04 AS builder + +WORKDIR /app/rust_ml + +RUN apt-get update && apt-get install -y \ + curl \ + build-essential \ + pkg-config \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +# Copy workspace +# This assumes the build context is the `rust_ml` root directory. +COPY . . 
+ +# Build the release binary for regression_inference +RUN cargo build --release -p regression_inference + +# ----------------------- +# Runtime Stage +# ----------------------- +FROM nvidia/vulkan:1.3-470 + +WORKDIR /app + +# Copy compiled binary from builder +COPY --from=builder /app/rust_ml/target/release/regression_inference /app/binary + +COPY ./model/regression_train/model.bin /app/model/regression_train/model.bin +# Environment variables +ENV RUST_LOG=info +# Setup the default mounted model path +ENV MODEL_PATH=/app/model/regression_train/model.bin + +EXPOSE 9050 + +CMD ["./binary"] diff --git a/rust_ml/lstm_inference/Cargo.toml b/rust_ml/lstm_inference/Cargo.toml new file mode 100644 index 0000000..816d3e7 --- /dev/null +++ b/rust_ml/lstm_inference/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "lstm_inference" +version = "0.1.0" +edition = "2024" + +[dependencies] +lstm_train = { path = "../lstm_train" } +burn = { version = "~0.20", features = ["std", "wgpu"], default-features = false } +axum = "0.8" +tokio = { version = "1", features = ["full"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tower-http = { version = "0.6", features = ["cors", "trace"] } + +[lints] +workspace = true diff --git a/rust_ml/lstm_inference/src/inference.rs b/rust_ml/lstm_inference/src/inference.rs new file mode 100644 index 0000000..7f618fe --- /dev/null +++ b/rust_ml/lstm_inference/src/inference.rs @@ -0,0 +1,56 @@ +use axum::{ + extract::State, + http::StatusCode, + Json, +}; +use burn::data::dataloader::batcher::Batcher; +use lstm_train::dataset::{SequenceBatcher, SequenceDatasetItem}; +use serde::Serialize; + +use crate::state::{AppState, MyBackend}; + +#[derive(Serialize)] +pub struct PredictResponse { + pub predicted_next_value: f32, +} + +#[derive(Serialize)] +pub struct ErrorResponse { + pub error: String, +} + +pub async fn predict_handler( + State(state): 
State, + Json(payload): Json>, +) -> Result, (StatusCode, Json)> { + let device: ::Device = Default::default(); + + // Explicitly construct the dataset mapping bypassing manual target generation for clients + let item = SequenceDatasetItem { + sequence: payload, + target: 0.0, + }; + + // Create batcher mapped to backend + let batcher = SequenceBatcher::default(); + + // Process item into batched tensors + let batch = batcher.batch(vec![item], &device); + + // Perform forward pass inference + let output = state.model.lock().unwrap().forward(batch.sequences, None); + + // Extract single result + let predicted_tensors = output.squeeze_dim::<1>(1).into_data(); + + let predicted_value = predicted_tensors + .as_slice::() + .unwrap_or(&[]) + .first() + .copied() + .unwrap_or(0.0); + + Ok(Json(PredictResponse { + predicted_next_value: predicted_value, + })) +} diff --git a/rust_ml/lstm_inference/src/main.rs b/rust_ml/lstm_inference/src/main.rs new file mode 100644 index 0000000..c71e0da --- /dev/null +++ b/rust_ml/lstm_inference/src/main.rs @@ -0,0 +1,50 @@ +#![recursion_limit = "256"] +mod inference; +mod model; +mod state; + +use axum::{ + routing::{get, post}, + Router, +}; +use std::net::SocketAddr; +use tower_http::cors::CorsLayer; +use tower_http::trace::TraceLayer; +use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; + +use crate::inference::predict_handler; +use crate::state::AppState; + +#[tokio::main] +async fn main() { + // Initialize tracing + tracing_subscriber::registry() + .with(tracing_subscriber::EnvFilter::new( + std::env::var("RUST_LOG").unwrap_or_else(|_| "info".into()), + )) + .with(tracing_subscriber::fmt::layer()) + .init(); + + // Load Model State + let model_dir = std::env::var("MODEL_DIR") + .unwrap_or_else(|_| -> String { "model/lstm_train".to_string() }); + + // Let the AppState construct the pre-loaded memory model + let state = AppState::new(&model_dir); + + // Build Axum Router + let app = Router::new() + 
.route("/health", get(|| async { "OK" })) + .route("/predict", post(predict_handler)) + .layer(CorsLayer::permissive()) + .layer(TraceLayer::new_for_http()) + .with_state(state); + + // Run Server + let port = 9050; + let addr = SocketAddr::from(([0, 0, 0, 0], port)); + tracing::info!("Server listening on http://{}", addr); + + let listener = tokio::net::TcpListener::bind(addr).await.unwrap(); + axum::serve(listener, app).await.unwrap(); +} diff --git a/rust_ml/lstm_inference/src/model.rs b/rust_ml/lstm_inference/src/model.rs new file mode 100644 index 0000000..9537858 --- /dev/null +++ b/rust_ml/lstm_inference/src/model.rs @@ -0,0 +1,380 @@ +use burn::{ + nn::{ + Dropout, DropoutConfig, Initializer, LayerNorm, LayerNormConfig, Linear, LinearConfig, + LstmState, Sigmoid, Tanh, + }, + prelude::*, +}; + +/// LSTM Cell implementation with layer normalization. +/// +/// Mathematical formulation of LSTM: +/// f_t = σ(W_f · [h_{t-1}, x_t] + b_f) # Forget gate +/// i_t = σ(W_i · [h_{t-1}, x_t] + b_i] # Input gate +/// g_t = tanh(W_g · [h_{t-1}, x_t] + b_g] # Candidate cell state +/// o_t = σ(W_o · [h_{t-1}, x_t] + b_o) # Output gate +/// +/// c_t = f_t ⊙ c_{t-1} + i_t ⊙ g_t # New cell state +/// h_t = o_t ⊙ tanh(c_t) # New hidden state +/// +/// where: +/// - σ is the sigmoid function +/// - ⊙ is the element-wise multiplication +/// - [h_{t-1}, x_t] represents concatenation + +#[derive(Module, Debug)] +pub struct LstmCell { + pub hidden_size: usize, + // Combined weight matrices for efficiency + // weight_ih layer uses combined weights for [i_t, f_t, g_t, o_t] for input x_t + // weight_hh layer uses combined weights for [i_t, f_t, g_t, o_t] for hidden state h_{t-1} + pub weight_ih: Linear, + pub weight_hh: Linear, + // Layer Normalization for better training stability. Don't use BatchNorm because the input distribution is always changing for LSTM. 
+ pub norm_x: LayerNorm, // Normalize gate pre-activations + pub norm_h: LayerNorm, // Normalize hidden state + pub norm_c: LayerNorm, // Normalize cell state + pub dropout: Dropout, +} + +/// Configuration to create a Lstm module using the init function. +#[derive(Config, Debug)] +pub struct LstmCellConfig { + // The size of the input features + pub input_size: usize, + // The size of the hidden state + pub hidden_size: usize, + // The number of hidden layers + pub dropout: f64, +} + +impl LstmCellConfig { + // Initialize parameters using best practices: + // 1. Orthogonal initialization for better gradient flow (here we use Xavier because of the lack of Orthogonal in burn) + // 2. Initialize forget gate bias to 1.0 to prevent forgetting at start of training + #[allow(clippy::single_range_in_vec_init)] + pub fn init( + &self, + device: &B::Device, + ) -> LstmCell { + let initializer = Initializer::XavierNormal { gain: 1.0 }; + let init_bias = Tensor::::ones([self.hidden_size], device); + + let mut weight_ih = LinearConfig::new(self.input_size, 4 * self.hidden_size) + .with_initializer(initializer.clone()) + .init(device); + // Set forget gate bias to 1.0 (helps with learning long sequences) + let bias = weight_ih + .bias + .clone() + .unwrap() + .val() + .slice_assign([self.hidden_size..2 * self.hidden_size], init_bias.clone()); + weight_ih.bias = weight_ih.bias.map(|p| p.map(|_t| bias)); + + let mut weight_hh = LinearConfig::new(self.hidden_size, 4 * self.hidden_size) + .with_initializer(initializer) + .init(device); + let bias = weight_hh + .bias + .clone() + .unwrap() + .val() + .slice_assign([self.hidden_size..2 * self.hidden_size], init_bias); + weight_hh.bias = weight_hh.bias.map(|p| p.map(|_t| bias)); + + LstmCell { + hidden_size: self.hidden_size, + weight_ih, + weight_hh, + norm_x: LayerNormConfig::new(4 * self.hidden_size).init(device), + norm_h: LayerNormConfig::new(self.hidden_size).init(device), + norm_c: 
LayerNormConfig::new(self.hidden_size).init(device), + dropout: DropoutConfig::new(self.dropout).init(), + } + } +} + +impl LstmCell { + /// Forward pass of LSTM cell. + /// Args: + /// x: Input tensor of shape (batch_size, input_size) + /// state: Tuple of (h_{t-1}, c_{t-1}) each of shape (batch_size, hidden_size) + /// Returns: + /// Tuple of (h_t, c_t) representing new hidden and cell states + pub fn forward( + &self, + x: Tensor, + state: LstmState, + ) -> LstmState { + let (h_prev, c_prev) = (state.hidden, state.cell); + + // Combined matrix multiplication for all gates + // Shape: (batch_size, 4 * hidden_size) + let gates_x = self.weight_ih.forward(x); // Transform input + let gates_h = self.weight_hh.forward(h_prev); // Transform previous hidden state + + // Apply layer normalization + let gates_x = self.norm_x.forward(gates_x); + // Combined gate pre-activations + let gates = gates_x + gates_h; + + // Split into individual gates + // Each gate shape: (batch_size, hidden_size) + let gates = gates.chunk(4, 1); + let i_gate = gates[0].clone(); + let f_gate = gates[1].clone(); + let g_gate = gates[2].clone(); + let o_gate = gates[3].clone(); + + // Apply gate non-linearities + let i_t = Sigmoid::new().forward(i_gate); + let f_t = Sigmoid::new().forward(f_gate); + let g_t = Tanh::new().forward(g_gate); + let o_t = Sigmoid::new().forward(o_gate); + + // Update cell state: c_t = f_t ⊙ c_{t-1} + i_t ⊙ g_t + let c_t = f_t * c_prev + i_t * g_t; + let c_t = self.norm_c.forward(c_t); + + // Update cell state: h_t = o_t ⊙ tanh(c_t) + let h_t = o_t * Tanh::new().forward(c_t.clone()); + let h_t = self.norm_h.forward(h_t); + + let h_t = self.dropout.forward(h_t); + + LstmState::new(h_t, c_t) + } + + // Initialize cell state and hidden state if provided or with zeros + pub fn init_state( + &self, + batch_size: usize, + device: &B::Device, + ) -> LstmState { + let cell = Tensor::zeros([batch_size, self.hidden_size], device); + let hidden = Tensor::zeros([batch_size, 
self.hidden_size], device); + + LstmState::new(cell, hidden) + } +} + +/// Stacked LSTM implementation supporting multiple layers +/// Each layer processes the output of the previous layer +#[derive(Module, Debug)] +pub struct StackedLstm { + pub layers: Vec>, +} + +#[derive(Config, Debug)] +pub struct StackedLstmConfig { + pub input_size: usize, + pub hidden_size: usize, + pub num_layers: usize, + pub dropout: f64, +} + +impl StackedLstmConfig { + pub fn init( + &self, + device: &B::Device, + ) -> StackedLstm { + let mut layers: Vec> = vec![]; + // Create list of LSTM cells, one for each layer + for i in 0..self.num_layers { + if i == 0 { + if i < self.num_layers - 1 { + layers.push( + LstmCellConfig::new(self.input_size, self.hidden_size, self.dropout) + .init(device), + ); + } else { + // No dropout on last layer + layers.push( + LstmCellConfig::new(self.input_size, self.hidden_size, 0.0).init(device), + ); + } + } else if i < self.num_layers - 1 { + layers.push( + LstmCellConfig::new(self.hidden_size, self.hidden_size, self.dropout) + .init(device), + ); + } else { + // No dropout on last layer + layers.push( + LstmCellConfig::new(self.hidden_size, self.hidden_size, 0.0).init(device), + ); + } + } + StackedLstm { layers } + } +} + +impl StackedLstm { + /// Process input sequence through stacked LSTM layers. 
+ /// + /// Args: + /// x: Input tensor of shape (batch_size, seq_length, input_size) + /// states: Optional initial states for each layer + /// + /// Returns: + /// Tuple of (output, states) where output has shape (batch_size, seq_length, hidden_size) + /// and states is a vector of length num_layers, both cell and hidden state in each element have shape (batch_size, hidden_size) + pub fn forward( + &self, + x: Tensor, + states: Option>>, + ) -> (Tensor, Vec>) { + let [batch_size, seq_length, _] = x.dims(); + let device = x.device(); + + let mut states = match states { + None => { + let mut temp: Vec> = vec![]; + for layer in self.layers.iter() { + temp.push(layer.init_state(batch_size, &device)); + } + temp + } + _ => states.unwrap(), + }; + + let mut layer_outputs = vec![]; + for t in 0..seq_length { + let mut input_t = x.clone().slice(s![.., t..t + 1, ..]).squeeze_dim::<2>(1); + for (i, lstm_cell) in self.layers.iter().enumerate() { + let mut state: LstmState = + LstmState::new(states[i].cell.clone(), states[i].hidden.clone()); + state = lstm_cell.forward(input_t, state); + input_t = state.hidden.clone(); + states[i] = state; + } + layer_outputs.push(input_t); + } + + // Stack output along sequence dimension + let output = Tensor::stack(layer_outputs, 1); + + (output, states) + } +} + +/// Complete LSTM network with bidirectional support. 
+/// +/// In bidirectional mode: +/// - Forward LSTM processes sequence from left to right +/// - Backward LSTM processes sequence from right to left +/// - Outputs are concatenated for final prediction +#[derive(Module, Debug)] +pub struct LstmNetwork { + // Forward direction LSTM + pub stacked_lstm: StackedLstm, + // Optional backward direction LSTM for bidirectional processing + pub reverse_lstm: Option>, + pub dropout: Dropout, + pub fc: Linear, +} + +#[derive(Config, Debug)] +pub struct LstmNetworkConfig { + #[config(default = 1)] + pub input_size: usize, // Single feature (number sequence) + #[config(default = 32)] + pub hidden_size: usize, // Size of LSTM hidden state + #[config(default = 2)] + pub num_layers: usize, // Number of LSTM layers + #[config(default = 1)] + pub output_size: usize, // Predict one number + #[config(default = 0.1)] + pub dropout: f64, + #[config(default = true)] + pub bidirectional: bool, // Use bidirectional LSTM +} + +impl LstmNetworkConfig { + pub fn init( + &self, + device: &B::Device, + ) -> LstmNetwork { + // Forward direction LSTM + let stacked_lstm = StackedLstmConfig::new( + self.input_size, + self.hidden_size, + self.num_layers, + self.dropout, + ) + .init(device); + + // Optional backward direction LSTM for bidirectional processing + let (reverse_lstm, hidden_size) = if self.bidirectional { + let lstm = StackedLstmConfig::new( + self.input_size, + self.hidden_size, + self.num_layers, + self.dropout, + ) + .init(device); + (Some(lstm), 2 * self.hidden_size) + } else { + (None, self.hidden_size) + }; + + let fc = LinearConfig::new(hidden_size, self.output_size).init(device); + let dropout = DropoutConfig::new(self.dropout).init(); + + LstmNetwork { + stacked_lstm, + reverse_lstm, + dropout, + fc, + } + } +} + +impl LstmNetwork { + /// Forward pass of the network. + /// + /// For bidirectional processing: + /// 1. Process sequence normally with forward LSTM + /// 2. Process reversed sequence with backward LSTM + /// 3. 
Concatenate both outputs + /// 4. Apply final linear transformation + /// + /// Args: + /// x: Input tensor of shape (batch_size, seq_length, input_size) + /// states: Optional initial states + /// + /// Returns: + /// Output tensor of shape (batch_size, output_size) + pub fn forward( + &self, + x: Tensor, + states: Option>>, + ) -> Tensor { + let seq_length = x.dims()[1]; + // Forward direction + let (mut output, _states) = self.stacked_lstm.forward(x.clone(), states); + + output = match &self.reverse_lstm { + Some(reverse_lstm) => { + //Process sequence in reverse direction + let (mut reverse_output, _states) = reverse_lstm.forward(x.flip([1]), None); + // Flip back to align with forward sequence + reverse_output = reverse_output.flip([1]); + // Concatenate forward and backward outputs along the feature dimension + output = Tensor::cat(vec![output, reverse_output], 2); + output + } + None => output, + }; + + // Apply dropout before final layer + output = self.dropout.forward(output); + // Use final timestep output for prediction + self.fc.forward( + output + .slice(s![.., seq_length - 1..seq_length, ..]) + .squeeze_dim::<2>(1), + ) + } +} diff --git a/rust_ml/lstm_inference/src/state.rs b/rust_ml/lstm_inference/src/state.rs new file mode 100644 index 0000000..ec06bff --- /dev/null +++ b/rust_ml/lstm_inference/src/state.rs @@ -0,0 +1,44 @@ +use burn::{ + backend::Wgpu, + module::Module, + prelude::Config, + record::{CompactRecorder, Recorder}, +}; +use crate::model::{LstmNetwork, LstmNetworkConfig}; +use std::sync::{Arc, Mutex}; + +pub type MyBackend = Wgpu; + +#[derive(Config,Debug)] +pub struct InferenceConfig { + pub model: LstmNetworkConfig, +} + +#[derive(Clone)] +pub struct AppState { + pub model: Arc>>, +} + +impl AppState { + pub fn new(model_dir: &str) -> Self { + let device = Default::default(); + + let config_path = format!("{}/config.json", model_dir); + let model_path = format!("{}/model", model_dir); + + // Load training configuration + let config = 
InferenceConfig::load(&config_path) + .expect("Config should exist for the model; run train first"); + + // Load model configuration and initialized layers + let record = CompactRecorder::new() + .load(model_path.into(), &device) + .expect("Trained model should exist; run train first"); + + let model: LstmNetwork = config.model.init(&device).load_record(record); + + Self { + model: Arc::new(Mutex::new(model)), + } + } +} diff --git a/rust_ml/mnist_ml/src/main.rs b/rust_ml/mnist_ml/src/main.rs index 03c994e..ac10cc6 100644 --- a/rust_ml/mnist_ml/src/main.rs +++ b/rust_ml/mnist_ml/src/main.rs @@ -1,3 +1,4 @@ +#![recursion_limit = "256"] mod data; mod model; mod training; diff --git a/rust_ml/regression_inference/Cargo.toml b/rust_ml/regression_inference/Cargo.toml new file mode 100644 index 0000000..afea4d1 --- /dev/null +++ b/rust_ml/regression_inference/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "regression_inference" +version = "0.1.0" +edition = "2024" + +[dependencies] +regression = { path = "../regression" } +burn = { version = "~0.20", features = ["std", "wgpu"], default-features = false } +axum = "0.8" +tokio = { version = "1", features = ["full"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tower-http = { version = "0.6", features = ["cors", "trace"] } + +[lints] +workspace = true diff --git a/rust_ml/regression_inference/src/inference.rs b/rust_ml/regression_inference/src/inference.rs new file mode 100644 index 0000000..37f985a --- /dev/null +++ b/rust_ml/regression_inference/src/inference.rs @@ -0,0 +1,51 @@ +use axum::{ + extract::State, + http::StatusCode, + Json, +}; +use burn::data::dataloader::batcher::Batcher; +use regression::dataset::{HousingBatcher, HousingDistrictItem}; +use serde::Serialize; + +use crate::state::{AppState, Backend}; + +#[derive(Serialize)] +pub struct PredictResponse { + pub predicted_median_house_value: f32, +} + 
+/// JSON body returned together with an error status.
+#[derive(Serialize)]
+pub struct ErrorResponse {
+    pub error: String,
+}
+
+/// Builds the HTTP 500 reply used when inference cannot produce a value.
+fn internal_error(message: &str) -> (StatusCode, Json<ErrorResponse>) {
+    let error = message.to_owned();
+    (StatusCode::INTERNAL_SERVER_ERROR, Json(ErrorResponse { error }))
+}
+
+/// `POST /predict`: scores one district and returns its predicted median house value.
+pub async fn predict_handler(
+    State(state): State<AppState>,
+    Json(payload): Json<HousingDistrictItem>,
+) -> Result<Json<PredictResponse>, (StatusCode, Json<ErrorResponse>)> {
+    let device: <Backend as burn::tensor::backend::Backend>::Device = Default::default();
+    // Only one item is batched, so the first output value is the prediction.
+    let batch = HousingBatcher::<Backend>::new(device.clone()).batch(vec![payload], &device);
+
+    // A poisoned lock becomes a 500 response instead of a panic in the request task.
+    let model = state.model.lock().map_err(|_| internal_error("model lock poisoned"))?;
+    let output = model.forward(batch.inputs);
+    drop(model);
+
+    // An empty result is a server fault; report it rather than answering 0.0.
+    let values = output.squeeze_dim::<1>(1).into_data();
+    let first = values.iter::<f32>().next();
+    let predicted_value = first.ok_or_else(|| internal_error("model produced no output"))?;
+
+    Ok(Json(PredictResponse {
+        predicted_median_house_value: predicted_value,
+    }))
+}
diff --git a/rust_ml/regression_inference/src/main.rs b/rust_ml/regression_inference/src/main.rs
new file mode 100644
index 0000000..d410b9e
--- /dev/null
+++ b/rust_ml/regression_inference/src/main.rs
@@ -0,0 +1,54 @@
+mod inference;
+mod model;
+mod state;
+
+use axum::{
+    routing::{get, post},
+    Router,
+};
+use std::net::SocketAddr;
+use tower_http::cors::CorsLayer;
+use tower_http::trace::TraceLayer;
+use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
+
+use crate::inference::predict_handler;
+use crate::state::AppState;
+
+/// Starts the regression inference server: installs tracing, loads the model once,
+/// then serves `GET /health` and `POST /predict` on port 9050 of every interface.
+#[tokio::main]
+async fn main() {
+    // The log filter is read from RUST_LOG and falls back to "info".
+    tracing_subscriber::registry()
+        .with(tracing_subscriber::EnvFilter::new(
+            std::env::var("RUST_LOG").unwrap_or_else(|_| "info".into()),
+        ))
+        .with(tracing_subscriber::fmt::layer())
+        .init();
+
+    // MODEL_PATH overrides the record location (in Docker the model is mounted under
+    // /app/model). `.bin` is the extension used by burn's `NoStdTrainingRecorder`.
+    let model_path = std::env::var("MODEL_PATH")
+        .unwrap_or_else(|_| "model/regression_train/model.bin".to_string());
+
+    // The model is loaded before the socket is bound, so a missing record aborts start-up.
+    let state = AppState::new(&model_path);
+
+    let app = Router::new()
+        .route("/health", get(|| async { "OK" }))
+        .route("/predict", post(predict_handler))
+        .layer(CorsLayer::permissive())
+        .layer(TraceLayer::new_for_http())
+        .with_state(state);
+
+    let port = 9050;
+    let addr = SocketAddr::from(([0, 0, 0, 0], port));
+    tracing::info!("Server listening on http://{}", addr);
+
+    let listener = tokio::net::TcpListener::bind(addr)
+        .await
+        .expect("failed to bind the listen address; is the port already in use?");
+    axum::serve(listener, app)
+        .await
+        .expect("HTTP server terminated with an error");
+}
diff --git a/rust_ml/regression_inference/src/model.rs b/rust_ml/regression_inference/src/model.rs
new file mode 100644
index 0000000..29ffed0
--- /dev/null
+++ b/rust_ml/regression_inference/src/model.rs
@@ -0,0 +1,50 @@
+use burn::{
+    nn::{Linear, LinearConfig, Relu},
+    prelude::*,
+};
+use regression::dataset::NUM_FEATURES;
+
+/// Feed-forward regressor: a linear input layer, ReLU, then a linear output layer.
+#[derive(Module, Debug)]
+pub struct RegressionModel<B: Backend> {
+    input_layer: Linear<B>,
+    output_layer: Linear<B>,
+    activation: Relu,
+}
+
+/// Hyper-parameters used to build a [`RegressionModel`].
+#[derive(Config, Debug)]
+pub struct RegressionModelConfig {
+    /// Width of the hidden layer (defaults to 64).
+    #[config(default = 64)]
+    pub hidden_size: usize,
+}
+
+impl RegressionModelConfig {
+    /// Builds a model on `device` with freshly initialised layers:
+    /// `NUM_FEATURES -> hidden_size -> 1`, both with bias.
+    pub fn init<B: Backend>(&self, device: &B::Device) -> RegressionModel<B> {
+        let input_layer = LinearConfig::new(NUM_FEATURES, self.hidden_size)
+            .with_bias(true)
+            .init(device);
+        let output_layer = LinearConfig::new(self.hidden_size, 1)
+            .with_bias(true)
+            .init(device);
+
+        RegressionModel {
+            input_layer,
+            output_layer,
+            activation: Relu::new(),
+        }
+    }
+}
+
+impl<B: Backend> RegressionModel<B> {
+    /// Runs the forward pass: input layer, ReLU, output layer.
+    // NOTE(review): rank 2 is assumed from the handler squeezing the output to rank 1; confirm.
+    pub fn forward(&self, input: Tensor<B, 2>) -> Tensor<B, 2> {
+        let x = self.input_layer.forward(input);
+        let x = self.activation.forward(x);
+        self.output_layer.forward(x)
+    }
+}
diff --git
a/rust_ml/regression_inference/src/state.rs b/rust_ml/regression_inference/src/state.rs
new file mode 100644
index 0000000..c5a8776
--- /dev/null
+++ b/rust_ml/regression_inference/src/state.rs
@@ -0,0 +1,41 @@
+use burn::{
+    backend::Wgpu,
+    module::Module,
+    record::{NoStdTrainingRecorder, Recorder},
+};
+use crate::model::{RegressionModel, RegressionModelConfig, RegressionModelRecord};
+use std::sync::{Arc, Mutex};
+
+/// Burn backend used for every tensor in this service.
+pub type Backend = Wgpu;
+
+/// State handed to each axum handler; cloning it only clones the `Arc`.
+#[derive(Clone)]
+pub struct AppState {
+    /// The loaded model, shared by all requests behind a mutex.
+    pub model: Arc<Mutex<RegressionModel<Backend>>>,
+}
+
+impl AppState {
+    /// Loads the trained weights at `model_path` into a default-configured model.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the record cannot be loaded from `model_path`.
+    pub fn new(model_path: &str) -> Self {
+        let device = Default::default();
+
+        // Read the saved record, then apply it to a model built from the default config.
+        let record: RegressionModelRecord<Backend> = NoStdTrainingRecorder::new()
+            .load(model_path.into(), &device)
+            .expect("Failed to load model record. Ensure the model is trained.");
+
+        let model = RegressionModelConfig::new()
+            .init(&device)
+            .load_record(record);
+
+        Self {
+            model: Arc::new(Mutex::new(model)),
+        }
+    }
+}