@article{MittalMaJoNa25,
  author  = {Mittal, Daksh and Ma, Yuanzhe and Joshi, Shalmali and Namkoong, Hongseok},
  title   = {A Planning Framework for Adaptive Labeling},
  journal = {arXiv:2502.06076 [stat.ML]},
  year    = {2025},
  note    = {Journal version under review; Conference version appeared in NeurIPS 2024},
  url     = {https://arxiv.org/abs/2502.06076},
}
@article{ZhangCaNaRu25,
  author  = {Zhang, Kelly and Cai, Tiffany and Namkoong, Hongseok and Russo, Daniel},
  title   = {Contextual Thompson Sampling via Generation of Missing Data},
  journal = {arXiv:2502.07064 [cs.LG]},
  year    = {2025},
  url     = {https://arxiv.org/abs/2502.07064},
}
@inproceedings{HsuDiSiNa24,
  author    = {Hsu, Brian and DiCiccio, Cyrus and Sivasubramoniapillai, Natesh and Namkoong, Hongseok},
  title     = {From Models to Systems: A Comprehensive Fairness Framework for Compositional Recommender Systems},
  booktitle = {Proceedings of Machine Learning Research},
  year      = {2024},
  url       = {https://arxiv.org/abs/2412.04655},
}
@article{ZengLiLaNa24,
  author  = {Zeng$*$, Yibo and Liu$*$, Jiashuo and Lam, Henry and Namkoong, Hongseok},
  title   = {{LLM} Embeddings Improve Test-time Adaptation to Tabular {$Y|X$}-Shifts},
  journal = {arXiv:2410.07395 [cs.LG]},
  year    = {2024},
  url     = {https://arxiv.org/abs/2410.07395},
}
@inproceedings{ChenLiChPeDoNa24,
  author    = {Chen, Haozhe and Li, Ang and Che, Ethan and Peng, Tianyi and Dong, Jing and Namkoong, Hongseok},
  title     = {{QGym}: Scalable Simulation and Benchmarking of Queuing Network Controllers},
  booktitle = {Advances in Neural Information Processing Systems 37, Datasets and Benchmarks Track},
  year      = {2024},
  url       = {https://arxiv.org/abs/2410.06170},
}
@inproceedings{ZolloSiYeLiNa24,
  author    = {Zollo$*$, Thomas and Siah$*$, Andrew and Ye, Naimeng and Li, Ang and Namkoong, Hongseok},
  title     = {{PersonalLLM}: Tailoring {LLMs} to Individual Preferences},
  booktitle = {International Conference on Learning Representations},
  year      = {2025},
  url       = {https://arxiv.org/abs/2409.20296},
}
Selected for oral presentations at the Econometric Society
Interdisciplinary Frontiers: Economics and AI+ML
conference and Conference on Digital Experimentation
Adaptivity
can significantly improve efficiency of experimentation, but it is challenging to implement even at large
online platforms with mature experimentation systems.
As a result, many real-world
experiments are deliberately implemented with large batches and a handful of
opportunities to update the sampling allocation as a way to reduce operational
costs of experimentation.
In this work, we focus on adaptive experiments with limited adaptivity (short horizons T < 10). Bandit algorithms focusing on long-horizon settings are tailored to provide regret guarantees for each specific case, and we find they often underperform
static A/B tests on practical problem instances with
batched feedback, non-stationarity, multiple objectives and constraints, and
personalization.
In response, we develop a mathematical programming framework for
developing adaptive experimentation algorithms. Instead of the
problem-specific research paradigm (akin to an optimization solver developed
for a particular linear program), we ask the modeler to write down a flexible
optimization formulation and use modern machine learning systems to
(heuristically) solve for adaptive designs.
Since a naive formulation of the adaptive
experimentation problem as a dynamic program is intractable,
we propose a batched view of the experimentation process. We model the uncertainty around
batch-level sufficient
statistics necessary to make allocation decisions, instead of attempting to
model unit-level outcomes whose distributions are commonly unknown and lead
to intractable dynamic programs with combinatorial action spaces.
Sequential Gaussian approximations are the main intellectual vehicle
powering our mathematical programming framework. CLT-based normal approximations are universal in statistical
inference, and a sequential variant we prove provides a simple optimization formulation that lends itself to modern computational tools. Through extensive empirical
evaluation, we observe that even a preliminary and heuristic solution
approach can provide major robustness benefits. Unlike bespoke methods (e.g.,
Thompson sampling variants), our mathematical programming framework provides
consistent gains over static randomized controlled trials and exhibits robust
performance across problem instances.
@article{CheJiNaWa24,
  author  = {Che, Ethan and Jiang, Daniel and Namkoong, Hongseok and Wang, Jimmy},
  title   = {Optimization-Driven Adaptive Experimentation},
  journal = {arXiv:2408.04570 [cs.LG]},
  year    = {2024},
  note    = {Selected for oral presentations at the Econometric Society Interdisciplinary Frontiers: Economics and AI+ML conference and Conference on Digital Experimentation},
  url     = {https://arxiv.org/abs/2408.04570},
}
@article{WangChJiNa24,
  author  = {Wang, Jimmy and Che, Ethan and Jiang, Daniel and Namkoong, Hongseok},
  title   = {{AExGym}: Benchmarks and Environments for Adaptive Experimentation},
  journal = {arXiv:2408.04531 [cs.LG]},
  year    = {2024},
  url     = {https://arxiv.org/abs/2408.04531},
}
AI models are omnipresent yet extrapolate in unexpected ways,
posing a significant barrier to robust and fair systems.
Building AI systems that can articulate their own uncertainty has been
a longstanding challenge in ML; such probabilistic reasoning capability
is key to bounding downside risk (e.g., delegating to human experts) and
continually improving system performance by gathering data to resolve uncertainty.
Despite recent advances in large language models, uncertainty quantification remains a
challenge, with methods attempting to leverage these deep neural networks—such as Bayesian
neural networks—frequently facing scalability limitations.
This work takes an important conceptual step towards building large-scale
AI systems that can reason about uncertainty through natural language.
We revisit De Finetti’s view of uncertainty coming from missing observations rather
than latent parameters, which allows us to pose learning to do statistical inference
as a prediction problem involving masked inputs. This formal connection between
autoregressive generation and probabilistic reasoning allows pre-trained sequence
models to express their epistemic uncertainty on underlying concepts, and refine
their beliefs as they gather more information.
Our findings open a promising avenue for addressing uncertainty in complex,
data-rich settings in a scalable way. We are excited by how this work leverages
a timeless insight to inform a timely topic: guiding the next generation of AI systems.
1. As internet data depletes, the pace of progress in LLM capabilities has been widely
observed to slow down (even in public media). This suggests that the limited
paradigm of pre-training on passively scraped web data has reached its full potential.
To move forward, the authors believe that the next generation of AI systems
must be able to understand tasks on which they suffer high uncertainty, and
actively gather data in order to continually improve their performance.
2. Since scalable uncertainty quantification poses a key intellectual bottleneck,
we resolve this by going back to De Finetti’s insight developed in the 1920s.
We believe the connection between Bayesian inference and autoregressive generation provides
the groundwork for building LLMs with probabilistic reasoning capabilities.
Taken together, our work showcases how principled scientific insights have the
potential to shape the design of even the largest scale AI systems.
@article{YeNa24,
  author   = {Ye, Naimeng and Namkoong, Hongseok},
  title    = {Exchangeable Sequence Models Quantify Uncertainty Over Latent Concepts},
  journal  = {arXiv:2408.03307 [stat.ML]},
  year     = {2024},
  url      = {https://arxiv.org/abs/2408.03307},
  selected = {true},
}
Causal inference provides the foundation of decision-making in sciences and industry alike,
and our work addresses a longstanding gap between practical performance and theoretical guarantees in
causal inference. Machine learning-based methods can provide a powerful way to control for confounding,
and the de facto standard approach is to use debiased estimators, which enjoy guarantees like statistical
efficiency and double robustness; examples include one-step
estimation (i.e., augmented inverse propensity weighting (AIPW)) and targeted
maximum likelihood estimation (TMLE).
However, in practice, these estimators have been observed to be unstable when there is
limited overlap between treatment and control, necessitating ad hoc adjustments
such as truncating propensity scores. In contrast, naive plug-in estimators
using an ML model can be more stable but lack these desirable asymptotic properties.
This trade-off can make it difficult to choose an estimator and ultimately,
to reach a conclusion regarding the treatment effect.
We propose a novel framework that combines the best of both worlds:
we derive the best plug-in estimator that is debiased,
retaining the stability of plug-ins while enjoying statistical efficiency and double robustness.
Our estimation framework is based on a constrained optimization problem and
can incorporate flexible modern ML techniques, including controlling for text-based confounders
using LLMs. Empirically, we demonstrate our approach over a range of examples,
and observe that it outperforms standard debiased methods when there is limited overlap.
As low overlap settings are a persistent challenge in practice,
we expect these results will be of interest to a broad spectrum of researchers,
including practitioners in statistics, economics, and machine learning.
We are unusually excited by how our framework provides a novel and pragmatic approach
to a longstanding challenge in causal inference.
By introducing an entirely new constrained optimization framework for semiparametric estimation, we hope to spur further progress in developing robust but theoretically grounded estimators.
@article{CaiFoHoNa24,
  author  = {Cai$*$, Tiffany and Fonseca$*$, Yuri and Hou, Kaiwen and Namkoong, Hongseok},
  title   = {Constrained Learning for Causal Inference and Semiparametric Statistics},
  journal = {arXiv:2405.09493 [stat.ML]},
  year    = {2024},
  url     = {https://arxiv.org/abs/2405.09493},
}
@article{CaiNaRuZh25,
  author  = {Cai, Tiffany and Namkoong, Hongseok and Russo, Daniel and Zhang, Kelly},
  title   = {Active Exploration via Autoregressive Generation of Missing Data},
  journal = {arXiv:2405.19466 [cs.LG]},
  year    = {2025},
  note    = {Selected for presentation at the Econometric Society Interdisciplinary Frontiers: Economics and AI+ML conference},
  url     = {https://arxiv.org/abs/2405.19466},
}
Recent advances in AI present significant opportunities to
rethink the design of service systems with AI at the
forefront. Even in the era of LLMs, managing a
workforce of human agents (“servers”) is a critical
problem. Crowdsourcing workers are vital for
aligning LLMs with human values (e.g., ChatGPT) and
in many domains, the cost of human annotation is a
binding constraint (e.g., medical diagnosis from
radiologists). This work models and analyzes modern
service systems involving human reviewers and
state-of-the-art AI models. A key intellectual
challenge in managing congestion within such
service systems is endogeneity. Prediction is never
the goal, and the link between predictive
performance and downstream decision-making
performance is not straightforward due to
endogeneity. Our work crystallizes how classical
tools from queueing theory provide managerial
insights into the design of AI-based service
systems.
@article{LeeNaZe24,
  author  = {Lee, Jiung and Namkoong, Hongseok and Zeng, Yibo},
  title   = {Design and Scheduling of an AI-based Queueing System},
  journal = {arXiv:2406.06855 [math.OC]},
  year    = {2024},
  url     = {https://arxiv.org/abs/2406.06855},
}
Different distribution shifts require different interventions, and algorithms must be grounded in the specific shifts they address. Advocating for an inductive approach to research on distributional robustness, we build an empirical testbed, "WhyShift", comprising natural shifts across 5 tabular datasets and 60,000 model configurations encompassing imbalanced learning algorithms and distributionally robust optimization (DRO) methods. We find Y|X-shifts are most prevalent on our testbed, in stark contrast to the heavy focus on X (covariate)-shifts in the ML literature. We conduct
an in-depth empirical analysis of DRO methods and find that the underlying model class (e.g.,
neural networks, XGBoost) and hyperparameter selection have a first-order impact in practice
despite being overlooked by DRO researchers. To further bridge the gap between methodological
research and practice, we design case studies that illustrate how such a refined understanding of
distribution shifts can enhance both data-centric and algorithmic interventions.
@article{LiuWaCuNa24,
  author  = {Liu$*$, Jiashuo and Wang$*$, Tianyu and Cui, Peng and Namkoong, Hongseok},
  title   = {On the Need for a Language Describing Distribution Shifts: Illustrations on Tabular Datasets},
  journal = {arXiv:2307.05284 [cs.LG]},
  year    = {2024},
  note    = {Conference version appeared in NeurIPS 2023.},
  url     = {https://arxiv.org/abs/2307.05284},
}
Starting with my one-year stint at Meta’s adaptive
experimentation team, I’ve been pondering on how
bandit algorithms are largely designed by
theoreticians to achieve good regret bounds and are
rarely used in practice due to the difficulty of
implementation and poor empirical performance. In
this work, we focus on underpowered, short-horizon,
and large-batch problems that typically arise in
practice. We use large batch normal approximations
to derive an MDP formulation for deriving the
optimal adaptive design. Our formulation allows the
use of computational tools for designing adaptive
algorithms, a break from the existing theory-driven
paradigm.
Our approach significantly improves statistical power over standard
methods, even when compared to Bayesian bandit algorithms
(e.g., Thompson sampling) that require full distributional knowledge
of individual rewards. Overall, we expand the scope of
adaptive experimentation to settings that are difficult
for standard methods, involving limited adaptivity,
low signal-to-noise ratio, and unknown reward distributions.
@article{CheNa23,
  author  = {Che, Ethan and Namkoong, Hongseok},
  title   = {Adaptive Experimentation at Scale: A Computational Framework for Flexible Batches},
  journal = {arXiv:2303.11582 [cs.LG]},
  year    = {2023},
  note    = {Major revision in Operations Research},
  url     = {https://arxiv.org/abs/2303.11582},
}
Recent advances in AI present significant opportunities to
rethink the design of service systems with AI at the
forefront. Even in the era of LLMs, managing a
workforce of human agents (“servers”) is a critical
problem. Crowdsourcing workers are vital for
aligning LLMs with human values (e.g., ChatGPT) and
in many domains, the cost of human annotation is a
binding constraint (e.g., medical diagnosis from
radiologists). This work models and analyzes modern
service systems involving human reviewers and
state-of-the-art AI models. A key intellectual
challenge in managing congestion within such
service systems is endogeneity. Prediction is never
the goal, and the link between predictive
performance and downstream decision-making
performance is not straightforward due to
endogeneity. Our work crystallizes how classical
tools from queueing theory provide managerial
insights into the design of AI-based service
systems.
@article{CaiNaYa23,
  author  = {Cai, Tiffany and Namkoong, Hongseok and Yadlowsky, Steve},
  title   = {Diagnosing Model Performance Under Distribution Shift},
  journal = {arXiv:2303.02011 [stat.ML]},
  year    = {2023},
  note    = {Second round review in Operations Research; Conference version appeared in the Symposium on Foundations of Responsible Computing 2023},
  url     = {https://arxiv.org/abs/2303.02011},
}
@article{BoyarskyNaPo23,
  author  = {Boyarsky, Ari and Namkoong, Hongseok and Pouget-Abadie, Jean},
  title   = {Modeling Interference via Experiment Rollout},
  journal = {arXiv:2305.10728 [stat.ME]},
  year    = {2023},
  note    = {Conference version appeared in ACM Conference on Economics and Computation},
  url     = {https://arxiv.org/abs/2305.10728},
}
@article{NamkoongDaBa24,
  author  = {Namkoong$*$, Hongseok and Daulton$*$, Samuel and Bakshy, Eytan},
  title   = {Distilled Thompson Sampling: Practical and Efficient Thompson Sampling via Imitation Learning},
  journal = {arXiv:2011.14266},
  year    = {2024},
  note    = {Major revision in Manufacturing \& Service Operations Management; Selected for an oral presentation at the NeurIPS 2020 OfflineRL Workshop},
  url     = {https://arxiv.org/abs/2011.14266},
}
@article{JeongNa22,
  author  = {Jeong, Sookyo and Namkoong, Hongseok},
  title   = {Assessing External Validity via Worst-case Subpopulation Treatment Effects},
  journal = {arXiv:2007.02411 [stat.ML]},
  year    = {2022},
  note    = {Short version appeared in Conference on Learning Theory 2020},
  url     = {https://arxiv.org/abs/2007.02411},
}
@inproceedings{WortsmanIlGaRoGoMoNaFaCaKoSc22,
  author    = {Wortsman, Mitchell and Ilharco, Gabriel and Gadre, Samir Yitzhak and Roelofs, Rebecca and Gontijo-Lopes, Raphael and Morcos, Ari S. and Namkoong, Hongseok and Farhadi, Ali and Carmon, Yair and Kornblith, Simon and Schmidt, Ludwig},
  title     = {Model Soups: Averaging Weights of Multiple Fine-tuned Models Improves Accuracy Without Increasing Inference Time},
  booktitle = {Proceedings of the 39th International Conference on Machine Learning},
  year      = {2022},
  url       = {https://proceedings.mlr.press/v162/wortsman22a/wortsman22a.pdf},
}
@inproceedings{WortsmanIlLiKiHaFaNaSc22,
  author    = {Wortsman$*$, Mitchell and Ilharco$*$, Gabriel and Kim, Jong Wook and Li, Mike and Kornblith, Simon and Roelofs, Rebecca and Gontijo-Lopes, Raphael and Hajishirzi, Hannaneh and Farhadi, Ali and Namkoong, Hongseok and Schmidt, Ludwig},
  title     = {Robust Fine-tuning of Zero-shot Models},
  booktitle = {Proceedings of the 32nd IEEE Conference on Computer Vision and Pattern Recognition},
  year      = {2022},
  note      = {CVPR Best Paper Finalist},
  url       = {https://openaccess.thecvf.com/content/CVPR2022/papers/Wortsman_Robust_Fine-Tuning_of_Zero-Shot_Models_CVPR_2022_paper.pdf},
}
@article{YadlowskyNaBaDuTi22,
  author  = {Yadlowsky, Steve and Namkoong, Hongseok and Basu, Sanjay and Duchi, John and Tian, Lu},
  title   = {Bounds on the Conditional and Average Treatment Effect with Unobserved Confounding Factors},
  journal = {Annals of Statistics},
  volume  = {50},
  number  = {5},
  pages   = {2587--2615},
  year    = {2022},
  doi     = {10.1214/22-AOS2195},
  url     = {https://projecteuclid.org/journals/annals-of-statistics/volume-50/issue-5/Bounds-on-the-conditional-and-average-treatment-effect-with-unobserved/10.1214/22-AOS2195.full},
  slide   = {YadlowskyNaBaDuTi22-slides.pdf},
}
% NOTE(review): the slide field below names the same file as the
% YadlowskyNaBaDuTi22 entry; confirm this is intentional and not a copy-paste.
@inproceedings{NamkoongKeYaBr20,
  author    = {Namkoong$*$, Hongseok and Keramati$*$, Ramtin and Yadlowsky$*$, Steve and Brunskill, Emma},
  title     = {Off-policy Policy Evaluation For Sequential Decisions Under Unobserved Confounding},
  booktitle = {Advances in Neural Information Processing Systems 33},
  year      = {2020},
  url       = {https://proceedings.neurips.cc/paper/2020/file/da21bae82c02d1e2b8168d57cd3fbab7-Paper.pdf},
  slide     = {YadlowskyNaBaDuTi22-slides.pdf},
}
@article{DuchiHaNa22,
  author  = {Duchi, John C. and Hashimoto, Tatsunori and Namkoong, Hongseok},
  title   = {Distributionally Robust Losses Against Mixture Covariate Shifts},
  journal = {Operations Research},
  year    = {2022},
  doi     = {10.1287/opre.2022.2363},
  url     = {https://pubsonline.informs.org/doi/10.1287/opre.2022.2363},
}
@article{DuchiNa21,
  author  = {Duchi, John C. and Namkoong, Hongseok},
  title   = {Learning Models with Uniform Performance via Distributionally Robust Optimization},
  journal = {Annals of Statistics},
  volume  = {49},
  number  = {3},
  pages   = {1378--1406},
  year    = {2021},
  doi     = {10.1214/20-AOS2004},
  url     = {https://projecteuclid.org/journals/annals-of-statistics/volume-49/issue-3/Learning-models-with-uniform-performance-via-distributionally-robust-optimization/10.1214/20-AOS2004.full},
}
@article{DuchiGlNa21,
  author  = {Duchi, John C. and Glynn, Peter W. and Namkoong, Hongseok},
  title   = {Statistics of Robust Optimization: A Generalized Empirical Likelihood Approach},
  journal = {Mathematics of Operations Research},
  volume  = {46},
  number  = {3},
  pages   = {946--969},
  year    = {2021},
  note    = {APS Best Student Paper Prize},
  doi     = {10.1287/moor.2020.1085},
  url     = {https://pubsonline.informs.org/doi/10.1287/moor.2020.1085},
}
@inproceedings{SinhaNaVoDu18,
  author    = {Sinha$*$, Aman and Namkoong$*$, Hongseok and Volpi, Riccardo and Duchi, John},
  title     = {Certifiable Distributional Robustness with Principled Adversarial Training},
  booktitle = {International Conference on Learning Representations},
  year      = {2018},
  note      = {Selected for a full oral presentation; 2\% of submissions},
  url       = {https://arxiv.org/abs/1710.10571},
}
@article{DuchiNa19,
  author  = {Duchi, John C. and Namkoong, Hongseok},
  title   = {Variance-based Regularization with Convex Objectives},
  journal = {Journal of Machine Learning Research},
  volume  = {20},
  year    = {2019},
  note    = {Conference version won NeurIPS 2017 Best Paper Award},
  url     = {https://jmlr.csail.mit.edu/papers/volume20/17-750/17-750.pdf},
  slide   = {NamkoongDu17-slides.pdf},
}
@inproceedings{VolpiNaSeDuMuSa18,
  author    = {Volpi$*$, Riccardo and Namkoong$*$, Hongseok and Duchi, John and Murino, Vittorio and Savarese, Silvio},
  title     = {Generalizing to Unseen Domains via Adversarial Data Augmentation},
  booktitle = {Advances in Neural Information Processing Systems 31},
  year      = {2018},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2018/file/1d94108e907bb8311d8802b48fd54b4a-Paper.pdf},
}
@inproceedings{OKellySiNaDuTe18,
  author    = {O'Kelly$*$, Mathew and Sinha$*$, Aman and Namkoong$*$, Hongseok and Duchi, John and Tedrake, Russ},
  title     = {Scalable End-to-End Autonomous Vehicle Testing via Rare-event Simulation},
  booktitle = {Advances in Neural Information Processing Systems 31},
  year      = {2018},
  url       = {https://proceedings.neurips.cc/paper_files/paper/2018/file/653c579e3f9ba5c03f2f2f8cf4512b39-Paper.pdf},
}
@inproceedings{HashimotoSrNaLi18,
  author    = {Hashimoto, Tatsunori and Srivastava, Megha and Namkoong, Hongseok and Liang, Percy},
  title     = {Fairness Without Demographics in Repeated Loss Minimization},
  booktitle = {International Conference on Machine Learning},
  year      = {2018},
  note      = {Best Paper Runner-up Award},
  url       = {https://proceedings.mlr.press/v80/hashimoto18a/hashimoto18a.pdf},
}
@inproceedings{NamkoongSiYaDu17,
  author    = {Namkoong, Hongseok and Sinha, Aman and Yadlowsky, Steve and Duchi, John C.},
  title     = {Adaptive Sampling Probabilities for Non-Smooth Optimization},
  booktitle = {International Conference on Machine Learning},
  pages     = {2574--2583},
  year      = {2017},
  url       = {https://proceedings.mlr.press/v70/namkoong17a/namkoong17a.pdf},
}
@inproceedings{NamkoongDu16,
  author    = {Namkoong, Hongseok and Duchi, John C.},
  title     = {Stochastic Gradient Methods for Distributionally Robust Optimization with {$f$}-divergences},
  booktitle = {Advances in Neural Information Processing Systems 29},
  year      = {2016},
  url       = {https://papers.nips.cc/paper_files/paper/2016/hash/4588e674d3f0faf985047d4c3f13ed0d-Abstract.html},
  slide     = {NamkoongDu16-slides.pdf},
}