publications
2022
- AuxMix: Semi-Supervised Learning With Unconstrained Unlabeled Data. Amin Banitalebi-Dehkordi, Pratik Gujjar, and Yong Zhang. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops, Jun 2022.
Semi-supervised learning (SSL) has seen great strides when labeled data is scarce but unlabeled data is abundant. Critically, most recent work assumes that such unlabeled data is drawn from the same distribution as the labeled data. In this work, we show that state-of-the-art SSL algorithms suffer a degradation in performance in the presence of unlabeled auxiliary data that does not necessarily possess the same class distribution as the labeled set. We term this problem Auxiliary-SSL and propose AuxMix, an algorithm that leverages self-supervised learning tasks to learn generic features in order to mask auxiliary data that are not semantically similar to the labeled set. We also propose to regularize learning by maximizing the predicted entropy for dissimilar auxiliary samples. We show an improvement of 5% over existing baselines with a ResNet-50 model trained on the CIFAR-10 dataset with 4k labeled samples, where all unlabeled data is drawn from the Tiny-ImageNet dataset. We report competitive results on several datasets and conduct ablation studies.
@inproceedings{auxmix,
  author    = {Banitalebi-Dehkordi, Amin and Gujjar, Pratik and Zhang, Yong},
  title     = {AuxMix: Semi-Supervised Learning With Unconstrained Unlabeled Data},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
  month     = jun,
  year      = {2022},
  pages     = {3999--4006},
}
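The entropy term mentioned in the abstract ("maximizing the predicted entropy for dissimilar auxiliary samples") can be sketched in a few lines. The following is a minimal, hypothetical PyTorch rendering, not the authors' implementation; the function name entropy_regularizer and the weighting coefficient lambda_aux are illustrative assumptions.

# Hedged sketch: minimizing this term maximizes the entropy of the
# classifier's predictions, pushing dissimilar auxiliary samples toward
# a uniform (maximally uncertain) class distribution.
import torch
import torch.nn.functional as F

def entropy_regularizer(logits: torch.Tensor) -> torch.Tensor:
    probs = F.softmax(logits, dim=1)
    log_probs = F.log_softmax(logits, dim=1)
    entropy = -(probs * log_probs).sum(dim=1)  # per-sample entropy
    return -entropy.mean()  # negated: lower loss = higher entropy

# Illustrative use (lambda_aux is an assumed hyperparameter):
# loss = supervised_loss + lambda_aux * entropy_regularizer(aux_logits)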
2019
- Classifying Pedestrian Actions In Advance Using Predicted Video Of Urban Driving Scenes. Pratik Gujjar and Richard Vaughan. In 2019 International Conference on Robotics and Automation (ICRA), Jun 2019.
We explore prediction of urban pedestrian actions by generating a video future of the traffic scene, and show promising results in classifying pedestrian behaviour before it is observed. We compare several encoder-decoder network models that predict 16 frames (400-600 milliseconds of video) from the preceding 16 frames. Our main contribution is a method for learning a sequence of representations to iteratively transform features learnt from the input to the future. We then use a binary action classifier network to determine a pedestrian’s crossing intent from the predicted video. Our results show an average precision of 81%, significantly higher than previous methods. The model with the best classification performance runs in 117 ms on a commodity GPU, giving an effective look-ahead of 416 ms.
@inproceedings{sfuthesispaper,
  author    = {Gujjar, Pratik and Vaughan, Richard},
  booktitle = {2019 International Conference on Robotics and Automation (ICRA)},
  title     = {Classifying Pedestrian Actions In Advance Using Predicted Video Of Urban Driving Scenes},
  year      = {2019},
  pages     = {2097--2103},
  doi       = {10.1109/ICRA.2019.8794278},
}
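The reported 416 ms look-ahead is consistent with simple arithmetic on the numbers in the abstract, assuming a 30 fps source video (an assumption on our part, though one that fits the stated 400-600 ms span for 16 frames):

# Worked check of the effective look-ahead, under an assumed 30 fps.
frames_predicted = 16
fps = 30                                    # assumed frame rate
horizon_ms = frames_predicted / fps * 1000  # ~533 ms of predicted future
runtime_ms = 117                            # reported model runtime
lookahead_ms = horizon_ms - runtime_ms      # ~416 ms, matching the paper
print(round(horizon_ms), round(lookahead_ms))  # 533 416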
2018
- Reasoning About Pedestrian Intent by Future Video Prediction. Pratik Gujjar. Simon Fraser University, Nov 2018.
Automated vehicles must react very quickly to pedestrians for safety. We explore analysis and prediction of pedestrian movements around urban roads by generating a video future of the traffic scene, and show promising results in classifying pedestrian behaviour before it is observed. Our first method is a decoding algorithm that is autoregressive over representations that an encoder learns from the input video. We compare many neural-network-based encoder-decoder models to predict 16 frames (400-600 milliseconds) of video. We present the contributions of time-dilated causal convolutions and additive residual connections in our recurrent decoding algorithm. Furthermore, we show that these connections encourage representations at various decoding stages to be mutually different. Our main contribution is learning a sequence of representations to iteratively transform features learnt from the input to the future. Our second method presents a binary action classifier network for determining a pedestrian’s crossing intent from videos predicted by our first method. Our results show an average precision of 81%, significantly higher than previous methods. Our best model in terms of classification performance has a run time of 117 ms on a commodity GPU, with an effective look-ahead of 416 ms.
@phdthesis{gujjar2018thesis,
  author = {Gujjar, Pratik},
  title  = {Reasoning About Pedestrian Intent by Future Video Prediction},
  school = {Simon Fraser University},
  year   = {2018},
  month  = nov,
}
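The thesis abstract highlights time-dilated causal convolutions and additive residual connections in the recurrent decoder. Below is a minimal PyTorch sketch of one such block; the class name, layer sizes, and dilation schedule are illustrative assumptions, not the thesis code.

import torch
import torch.nn as nn

class CausalResidualBlock(nn.Module):
    # 1-D convolution over time, left-padded so the output at time t only
    # sees inputs up to t (causal), with the input added back (additive
    # residual connection).
    def __init__(self, channels: int, kernel_size: int = 3, dilation: int = 2):
        super().__init__()
        self.left_pad = (kernel_size - 1) * dilation  # causal padding
        self.conv = nn.Conv1d(channels, channels,
                              kernel_size=kernel_size, dilation=dilation)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, time)
        y = nn.functional.pad(x, (self.left_pad, 0))  # pad the past side only
        y = torch.relu(self.conv(y))
        return x + y  # additive residual connection

# Illustrative use: stacking blocks with growing dilation widens the
# decoder's temporal receptive field.
# decoder = nn.Sequential(*(CausalResidualBlock(64, dilation=2 ** i) for i in range(3)))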