Few-shot temporal action localization (TAL) methods that adapt large models via single-prompt tuning often fail to produce precise temporal boundaries. This is because a single prompt tends to learn a non-discriminative mean representation from sparse data, limiting generalization. We propose PLOT-TAL, a multi-prompt ensemble framework that encourages each prompt to specialize on compositional sub-events of an action. To enforce this specialization, we leverage Optimal Transport (OT) to find globally optimal alignments between the prompt ensemble and a video’s temporal features. Our approach eliminates the need for complex meta-learning while achieving state-of-the-art results on THUMOS’14 and EPIC-Kitchens. The significant improvements, particularly at higher IoU thresholds, validate that learning distributed, compositional representations leads to more precise temporal localization in few-shot settings.
@inproceedings{Fish2025PLOTTAL,
  author    = {Fish, Edward and Gilbert, Andrew},
  title     = {{PLOT-TAL}: Prompt Learning with Optimal Transport for Few-Shot Temporal Action Localization},
  booktitle = {Proceedings of the {IEEE/CVF} International Conference on Computer Vision ({ICCV}) Workshops},
  series    = {Closing the Loop between Vision and Language, 6th Workshop},
  publisher = {IEEE},
  year      = {2025},
  url       = {https://andrewjohngilbert.github.io/PLOT-TAL/},
}