You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: _bibliography/references.bib
+32-38Lines changed: 32 additions & 38 deletions
Original file line number
Diff line number
Diff line change
@@ -1,3 +1,35 @@
1
+
@inproceedings{karhade2026any4d,
  title     = {{Any4D}: Unified Feed-Forward Metric 4D Reconstruction},
  author    = {Karhade, Jay and Keetha, Nikhil and Zhang, Yuchen and Gupta, Tanisha and Sharma, Akash and Scherer, Sebastian and Ramanan, Deva},
  year      = {2026},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  url       = {https://any-4d.github.io/},
  abstract  = {We present Any4D, a scalable multi-view transformer for metric-scale, dense feed-forward 4D reconstruction. Any4D directly generates per-pixel motion and geometry predictions for N frames, in contrast to prior work that typically focuses on either 2-view dense scene flow or sparse 3D point tracking. Moreover, unlike other recent methods for 4D reconstruction from monocular RGB videos, Any4D can process additional modalities and sensors such as RGB-D frames, IMU-based egomotion, and Radar Doppler measurements, when available. One of the key innovations that allows for such a flexible framework is a modular representation of a 4D scene; specifically, per-view 4D predictions are encoded using a variety of egocentric factors (depthmaps and camera intrinsics) represented in local camera coordinates, and allocentric factors (camera extrinsics and scene flow) represented in global world coordinates. We achieve superior performance across diverse setups - both in terms of accuracy (2-3X lower error) and compute efficiency (15X faster) - opening avenues for multiple downstream applications.},
}
9
+
@inproceedings{chen2026cometokens,
  title     = {{Co-Me}: Confidence-Guided Token Merging for Visual Geometric Transformers},
  author    = {Chen, Yutian and Qiu, Yuheng and Li, Ruogu and Agha, Ali and Omidshafiei, Shayegan and Patrikar, Jay and Scherer, Sebastian},
  year      = {2026},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  url       = {https://arxiv.org/abs/2511.14751},
  abstract  = {We propose Confidence-Guided Token Merging (Co-Me), an acceleration mechanism for visual geometric transformers without retraining or finetuning the base model. Co-Me distilled a light-weight confidence predictor to rank tokens by uncertainty and selectively merge low-confidence ones, effectively reducing computation while maintaining spatial coverage. Compared to similarity-based merging or pruning, the confidence signal in Co-Me reliably indicates regions emphasized by the transformer, enabling substantial acceleration without degrading performance. Co-Me applies seamlessly to various multi-view and streaming visual geometric transformers, achieving speedups that scale with sequence length. When applied to VGGT and MapAnything, Co-Me achieves up to 11.3x and 7.2x speedup, making visual geometric transformers practical for real-time 3D perception and reconstruction.},
}
17
+
@inproceedings{yu2026unified,
  title     = {Unified Spherical Frontend: Learning Rotation-Equivariant Representations of Spherical Images from Any Camera},
  author    = {Yu, Mukai and Dabhi, Mosam and Xie, Liuyue and Scherer, Sebastian and Jeni, L{\'a}szl{\'o} A.},
  year      = {2026},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  url       = {https://arxiv.org/pdf/2511.18174},
  abstract  = {Modern perception increasingly relies on fisheye, panoramic, and other wide field-of-view (FoV) cameras, yet most pipelines still apply planar CNNs designed for pinhole imagery on 2D grids, where image-space neighborhoods misrepresent physical adjacency and models are sensitive to global rotations. Frequency-domain spherical CNNs partially address this mismatch but require costly spherical harmonic transforms that constrain resolution and efficiency. We introduce the Unified Spherical Frontend (USF), a lens-agnostic framework that transforms images from any calibrated camera into a unit-sphere representation via ray-direction correspondences, and performs spherical resampling, convolution, and pooling directly in the spatial domain. USF is modular: projection, location sampling, interpolation, and resolution control are fully decoupled. Its distance-only spherical kernels offer configurable rotation-equivariance (mirroring translation-equivariance in planar CNNs) while avoiding harmonic transforms entirely. We compare standard planar backbones with their spherical counterparts across classification, detection, and segmentation tasks on synthetic (Spherical MNIST) and real-world datasets (PANDORA, Stanford 2D-3D-S), and stress-test robustness to extreme lens distortions, varying FoV, and arbitrary rotations. USF processes high-resolution spherical imagery efficiently and maintains less than 1\% performance drop under random test-time rotations, even without rotational augmentation, and even enables zero-shot generalization from one lens type to unseen wide-FoV lenses with minimal performance degradation.},
}
25
+
@inproceedings{alama2026radseg,
  title     = {{RADSeg}: Unleashing Parameter and Compute Efficient Zero-Shot Open-Vocabulary Segmentation Using Agglomerative Models},
  author    = {Alama, Omar and Jariwala, Darshil and Bhattacharya, Avigyan and Kim, Seungchan and Wang, Wenshan and Scherer, Sebastian},
  year      = {2026},
  booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), Findings},
  url       = {https://arxiv.org/abs/2511.19704},
  abstract  = {Open-vocabulary semantic segmentation (OVSS) underpins many vision and robotics tasks that require generalizable semantic understanding. Existing approaches either rely on limited segmentation training data, which hinders generalization, or apply zero-shot heuristics to vision-language models (e.g CLIP), while the most competitive approaches combine multiple models to improve performance at the cost of high computational and memory demands. In this work, we leverage an overlooked agglomerative vision foundation model, RADIO, to improve zero-shot OVSS along three key axes simultaneously: mIoU, latency, and parameter efficiency. We present the first comprehensive study of RADIO for zero-shot OVSS and enhance its performance through self-correlating recursive attention, self-correlating global aggregation, and computationally efficient mask refinement. Our approach, RADSeg, achieves 6-30% mIoU improvement in the base ViT class while being 3.95x faster and using 2.5x fewer parameters. Surprisingly, RADSeg-base (105M) outperforms previous combinations of huge vision models (850-1350M) in mIoU, achieving state-of-the-art accuracy with substantially lower computational and memory cost.},
}
1
33
@inproceedings{kim2026raven,
2
34
title = {RAVEN: Resilient Aerial Navigation via Open-Set Semantic Memory and Behavior Adaptation},
3
35
author = {Kim, Seungchan and Alama, Omar and Kurdydyk, Dmytro and Keetha, Nikhil and Wang, Wenshan and Bisk, Yonatan and Scherer, Sebastian},
abstract = {We introduce MapAnything, a unified transformer-based feed-forward model that ingests one or more images along with optional geometric inputs such as camera intrinsics, poses, depth, or partial reconstructions, and then directly regresses the metric 3D scene geometry and cameras. MapAnything leverages a factored representation of multi-view scene geometry, i.e., a collection of depth maps, local ray maps, camera poses, and a metric scale factor that effectively upgrades local reconstructions into a globally consistent metric frame. Standardizing the supervision and training across diverse datasets, along with flexible input augmentation, enables MapAnything to address a broad range of 3D vision tasks in a single feed-forward pass, including uncalibrated structure-from-motion, calibrated multi-view stereo, monocular depth estimation, camera localization, depth completion, and more. We provide extensive experimental analyses and model ablations demonstrating that MapAnything outperforms or matches specialist feed-forward models while offering more efficient joint training behavior, thus paving the way toward a universal 3D reconstruction backbone.}
31
63
}
32
-
@misc{karhade2025any4d,
33
-
title = {{Any4D}: Unified Feed-Forward Metric 4D Reconstruction},
34
-
author = {Karhade, Jay and Keetha, Nikhil and Zhang, Yuchen and Gupta, Tanisha and Sharma, Akash and Scherer, Sebastian and Ramanan, Deva},
35
-
year = {2025},
36
-
journal = {arXiv preprint arXiv:2512.10935},
37
-
url = {https://any-4d.github.io/},
38
-
abstract = {We present Any4D, a scalable multi-view transformer for metric-scale, dense feed-forward 4D reconstruction. Any4D directly generates per-pixel motion and geometry predictions for N frames, in contrast to prior work that typically focuses on either 2-view dense scene flow or sparse 3D point tracking. Moreover, unlike other recent methods for 4D reconstruction from monocular RGB videos, Any4D can process additional modalities and sensors such as RGB-D frames, IMU-based egomotion, and Radar Doppler measurements, when available. One of the key innovations that allows for such a flexible framework is a modular representation of a 4D scene; specifically, per-view 4D predictions are encoded using a variety of egocentric factors (depthmaps and camera intrinsics) represented in local camera coordinates, and allocentric factors (camera extrinsics and scene flow) represented in global world coordinates. We achieve superior performance across diverse setups - both in terms of accuracy (2-3X lower error) and compute efficiency (15X faster) - opening avenues for multiple downstream applications.}
39
-
}
40
64
@inproceedings{zhang2025ufm,
41
65
title = {{UFM}: A Simple Path towards Unified Dense Correspondence with Flow},
42
66
author = {Zhang, Yuchen and Keetha, Nikhil and Lyu, Chenwei and Jhamb, Bhuvan and Chen, Yutian and Qiu, Yuheng and Karhade, Jay and Jha, Shreyas and Hu, Yaoyu and Ramanan, Deva and others},
@@ -53,36 +77,6 @@ @article{he2025grndctrl
53
77
journal = {arXiv preprint arXiv:2512.01952},
54
78
abstract = {Recent advances in video world modeling have enabled large-scale generative models to simulate embodied environments with high visual fidelity, providing strong priors for prediction, planning, and control. Yet, despite their realism, these models often lack geometric grounding, limiting their use in navigation tasks that require spatial coherence and long-horizon stability. We introduce Reinforcement Learning with World Grounding (RLWG), a self-supervised post-training framework that aligns pretrained world models with a physically verifiable structure through geometric and perceptual rewards. Analogous to reinforcement learning from verifiable feedback (RLVR) in language models, RLWG can use multiple rewards that measure pose cycle-consistency, depth reprojection, and temporal coherence. We instantiate this framework with GrndCtrl, a reward-aligned adaptation method based on Group Relative Policy Optimization (GRPO), yielding world models that maintain stable trajectories, consistent geometry, and reliable rollouts for embodied navigation. Like post-training alignment in large language models, GrndCtrl leverages verifiable rewards to bridge generative pretraining and grounded behavior, achieving superior spatial coherence and navigation stability over supervised fine-tuning in outdoor environments.}
55
79
}
56
-
@article{alama2025radseg,
57
-
title = {RADSeg: Unleashing Parameter and Compute Efficient Zero-Shot Open-Vocabulary Segmentation Using Agglomerative Models},
58
-
author = {Alama, Omar and Jariwala, Darshil and Bhattacharya, Avigyan and Kim, Seungchan and Wang, Wenshan and Scherer, Sebastian},
59
-
year = {2025},
60
-
url = {https://arxiv.org/abs/2511.19704},
61
-
journal = {arXiv preprint arXiv:2511.19704},
62
-
abstract = {Open-vocabulary semantic segmentation (OVSS) underpins many vision and robotics tasks that require generalizable semantic understanding. Existing approaches either rely on limited segmentation training data, which hinders generalization, or apply zero-shot heuristics to vision-language models (e.g CLIP), while the most competitive approaches combine multiple models to improve performance at the cost of high computational and memory demands. In this work, we leverage an overlooked agglomerative vision foundation model, RADIO, to improve zero-shot OVSS along three key axes simultaneously: mIoU, latency, and parameter efficiency. We present the first comprehensive study of RADIO for zero-shot OVSS and enhance its performance through self-correlating recursive attention, self-correlating global aggregation, and computationally efficient mask refinement. Our approach, RADSeg, achieves 6-30% mIoU improvement in the base ViT class while being 3.95x faster and using 2.5x fewer parameters. Surprisingly, RADSeg-base (105M) outperforms previous combinations of huge vision models (850-1350M) in mIoU, achieving state-of-the-art accuracy with substantially lower computational and memory cost.}
63
-
}
64
-
@article{chen2025cometokens,
65
-
title = {Co-Me: Confidence-Guided Token Merging for Visual Geometric Transformers},
66
-
author = {Chen, Yutian and Qiu, Yuheng and Li, Ruogu and Agha, Ali and Omidshafiei, Shayegan and Patrikar, Jay and Scherer, Sebastian},
67
-
year = {2025},
68
-
url = {https://arxiv.org/abs/2511.14751},
69
-
journal = {arXiv preprint arXiv:2511.14751},
70
-
abstract = {We propose Confidence-Guided Token Merging (Co-Me), an acceleration mechanism for visual geometric transformers without retraining or finetuning the base model. Co-Me distilled a light-weight confidence predictor to rank tokens by uncertainty and selectively merge low-confidence ones, effectively reducing computation while maintaining spatial coverage. Compared to similarity-based merging or pruning, the confidence signal in Co-Me reliably indicates regions emphasized by the transformer, enabling substantial acceleration without degrading performance. Co-Me applies seamlessly to various multi-view and streaming visual geometric transformers, achieving speedups that scale with sequence length. When applied to VGGT and MapAnything, Co-Me achieves up to 11.3x and 7.2x speedup, making visual geometric transformers practical for real-time 3D perception and reconstruction.}
71
-
}
72
-
@misc{yu2025unified,
73
-
title = {Unified Spherical Frontend: Learning Rotation-Equivariant Representations of Spherical Images from Any Camera},
74
-
shorttitle = {Unified Spherical Frontend},
75
-
author = {Mukai Yu and Mosam Dabhi and Liuyue Xie and Sebastian Scherer and L{\'a}szl{\'o} A. Jeni},
76
-
year = {2025},
77
-
publisher = {arXiv},
78
-
doi = {10.48550/arXiv.2511.18174},
79
-
url = {https://arxiv.org/pdf/2511.18174},
80
-
eprint = {2511.18174},
81
-
primaryclass = {cs.CV},
82
-
abstract = {Modern perception increasingly relies on fisheye, panoramic, and other wide field-of-view (FoV) cameras, yet most pipelines still apply planar CNNs designed for pinhole imagery on 2D grids, where image-space neighborhoods misrepresent physical adjacency and models are sensitive to global rotations. Frequency-domain spherical CNNs partially address this mismatch but require costly spherical harmonic transforms that constrain resolution and efficiency. We introduce the Unified Spherical Frontend (USF), a lens-agnostic framework that transforms images from any calibrated camera into a unit-sphere representation via ray-direction correspondences, and performs spherical resampling, convolution, and pooling directly in the spatial domain. USF is modular: projection, location sampling, interpolation, and resolution control are fully decoupled. Its distance-only spherical kernels offer configurable rotation-equivariance (mirroring translation-equivariance in planar CNNs) while avoiding harmonic transforms entirely. We compare standard planar backbones with their spherical counterparts across classification, detection, and segmentation tasks on synthetic (Spherical MNIST) and real-world datasets (PANDORA, Stanford 2D-3D-S), and stress-test robustness to extreme lens distortions, varying FoV, and arbitrary rotations. USF processes high-resolution spherical imagery efficiently and maintains less than 1\% performance drop under random test-time rotations, even without rotational augmentation, and even enables zero-shot generalization from one lens type to unseen wide-FoV lenses with minimal performance degradation.},
83
-
archiveprefix = {arXiv},
84
-
keywords = {Computer Science - Computer Vision and Pattern Recognition}
85
-
}
86
80
@inproceedings{alama2025rayfronts,
87
81
title = {RayFronts: Open-Set Semantic Ray Frontiers for Online Scene Understanding and Exploration},
88
82
author = {Omar Alama and Avigyan Bhattacharya and Haoyang He and Seungchan Kim and Yuheng Qiu and Wenshan Wang and Cherie Ho and Nikhil Keetha and Sebastian Scherer},
0 commit comments