Hao Li; Cor-Paul Bezemer
Bridging the language gap: an empirical study of bindings for open source machine learning libraries across software package ecosystems Journal Article
Empirical Software Engineering, 30 (6), 2024.
Abstract | BibTeX | Tags: Library bindings, Machine learning, SE4AI, SE4ML
@article{li_MLbindings,
title = {Bridging the language gap: an empirical study of bindings for open source machine learning libraries across software package ecosystems},
author = {Hao Li and Cor-Paul Bezemer},
year = {2024},
date = {2024-10-18},
urldate = {2024-10-18},
journal = {Empirical Software Engineering},
volume = {30},
number = {6},
abstract = {Open source machine learning (ML) libraries enable developers to integrate advanced ML functionality into their own applications. However, popular ML libraries, such as TensorFlow, are not available natively in all programming languages and software package ecosystems. Hence, developers who wish to use an ML library which is not available in their programming language or ecosystem of choice, may need to resort to using a so-called binding library (or binding). Bindings provide support across programming languages and package ecosystems for reusing a host library. For example, the Keras .NET binding provides support for the Keras library in the NuGet (.NET) ecosystem even though the Keras library was written in Python. In this paper, we collect 2,436 cross-ecosystem bindings for 546 ML libraries across 13 software package ecosystems by using an approach called BindFind, which can automatically identify bindings and link them to their host libraries. Furthermore, we conduct an in-depth study of 133 cross-ecosystem bindings and their development for 40 popular open source ML libraries. Our findings reveal that the majority of ML library bindings are maintained by the community, with npm being the most popular ecosystem for these bindings. Our study also indicates that most bindings cover only a limited range of the host library’s releases, often experience considerable delays in supporting new releases, and have widespread technical lag. Our findings highlight key factors to consider for developers integrating bindings for ML libraries and open avenues for researchers to further investigate bindings in software package ecosystems.},
keywords = {Library bindings, Machine learning, SE4AI, SE4ML},
pubstate = {published},
tppubtype = {article}
}
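What reusing a host library through a binding looks like in practice can be illustrated with a short sketch, assuming the @tensorflow/tfjs-node package on npm (a Node.js binding to the native TensorFlow runtime); the package choice and the toy model are illustrative and not drawn from the paper.

import * as tf from '@tensorflow/tfjs-node';

async function main(): Promise<void> {
  // Build and compile a small model; the API is exposed in TypeScript/JavaScript,
  // while the underlying computation runs in the native TensorFlow library.
  const model = tf.sequential();
  model.add(tf.layers.dense({ inputShape: [4], units: 16, activation: 'relu' }));
  model.add(tf.layers.dense({ units: 3, activation: 'softmax' }));
  model.compile({ optimizer: 'adam', loss: 'categoricalCrossentropy', metrics: ['accuracy'] });

  // Train on toy data to show the end-to-end path through the binding.
  const xs = tf.randomNormal([32, 4]);
  const ys = tf.oneHot(tf.randomUniform([32], 0, 3, 'int32'), 3);
  await model.fit(xs, ys, { epochs: 5, verbose: 0 });
}

main();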
Mohammad Reza Taesiri
Leveraging Foundation Models for Video Game Quality Assurance PhD Thesis
2024.
Abstract | BibTeX | Tags: Computer games, Computer vision, Game development, Game testing, Gameplay videos, Machine learning, Software quality
@phdthesis{phd_taesiri,
title = {Leveraging Foundation Models for Video Game Quality Assurance},
author = {Mohammad Reza Taesiri},
year = {2024},
date = {2024-09-25},
abstract = {The video game industry has become a powerhouse in the global entertainment economy. Creating engaging, high-quality games demands intricate development processes and significant resources. As projects grow in complexity and scale, developers often grapple with demanding schedules, tight deadlines, and the risk of burnout. These pressures highlight the need for more efficient development strategies, with quality assurance (QA) emerging as a critical area for optimization.
Artificial Intelligence (AI) has the potential to address these challenges by enhancing the game QA processes in large gaming companies. Specifically, foundation models—large pre-trained AI models—offer promising applications to improve these processes. Exploring novel uses of these advanced AI models could reveal their potential and limitations in optimizing game development workflows, potentially alleviating some of the industry’s pressing issues and facilitating the creation of high-quality, engaging games.
In this thesis, my goal is to improve video game testing processes by leveraging foundation models to ensure the final product reaches a desirable quality. I explore new opportunities that foundation models bring to game testing, from searching for instances of game bugs within video repositories to assisting human testers in catching bugs, through three studies:
First, I investigate the utility of image-text foundation models in retrieving gameplay videos. In this study, I create a video search engine designed to help developers efficiently search video repositories for examples of video game bugs using textual descriptions. For example, developers can find all instances of a bug by using a textual description of the bug, such as a horse flying in the air. This study lays the groundwork for AI-based game QA processes, with results demonstrating significant potential.
Next, I introduce GlitchBench, a benchmarking dataset of video game glitches and anomalies designed to assess state-of-the-art large multimodal models, such as GPT-4V, in detecting and understanding game bugs. This extensive dataset includes a wide range of images depicting various glitches, sourced from both online platforms and synthetic sets created within the Unity game engine. GlitchBench includes both common and rare glitches encountered in the video game quality assurance process. The findings from this study highlight both the promise and limitations of existing models, particularly in unusual and rare cases.
Lastly, I introduce VideoGameBunny, a large multimodal model specifically trained for video game content, accompanied by a dataset of 389,565 image-instruction pairs. My analysis demonstrates that VideoGameBunny outperforms much larger models in video game understanding tasks while using 4.2× fewer parameters. This result underscores the effectiveness and promise of using a high-quality dataset to improve models’ understanding of video games, thus making them more effective in the game QA process.
Future work should focus on enhancing the generalization and robustness of AI models in the gaming context, particularly through better integration of vision and language components. This integration could be achieved using either early or late fusion methods. For late fusion methods, where two pre-trained models are connected, better alignment between these components can be achieved through improved training data and strategies. Alternatively, early fusion techniques, which involve training both components simultaneously to enhance their integration, can overcome many issues that existing models have.},
keywords = {Computer games, Computer vision, Game development, Game testing, Gameplay videos, Machine learning, Software quality},
pubstate = {published},
tppubtype = {phdthesis}
}
Hao Li
Investigating the Quality of Bindings for Machine Learning Libraries in Software Package Ecosystems PhD Thesis
2024.
Abstract | BibTeX | Tags: Machine learning, Software Ecosystem, Software quality
@phdthesis{phd_haoli,
title = {Investigating the Quality of Bindings for Machine Learning Libraries in Software Package Ecosystems},
author = {Hao Li},
year = {2024},
date = {2024-08-21},
urldate = {2024-08-21},
abstract = {Machine learning (ML) has revolutionized many domains, with developers often relying on open source ML libraries to integrate ML capabilities into their projects. However, these libraries primarily support a single programming language, limiting their availability for projects in other languages. Bindings serve as bridges between programming languages by providing interfaces to ML libraries. This thesis investigates the quality of bindings for ML libraries in software package ecosystems, focusing on their maintenance and software quality.
The first study presented in this thesis introduces BindFind, an automated approach to identify bindings and link them with their corresponding host libraries across various software package ecosystems. By analyzing 2,436 bindings for 546 ML libraries, we find that most bindings are community-maintained, with npm being the most popular choice for publishing these bindings. The analysis reveals that these bindings usually cover a limited range of releases from their host library and experience significant delays in supporting new releases.
In the second study, we investigate the usage and rationale behind release-level deprecation in bindings for ML libraries within the Cargo and npm ecosystems. We discover that bindings in Cargo have a higher percentage of deprecated releases compared to general packages, while the percentages of deprecated releases and general packages are similar in npm. The primary reasons for deprecation are package removal or replacement and defects in both ecosystems. We also identify the issue of implicitly deprecated releases in Cargo due to deprecation propagation through the dependency network.
The third study evaluates the impact of using different bindings on the software quality of ML systems through experiments on model training and inference using TensorFlow and PyTorch across four programming languages. The results show that models trained with one binding perform consistently in inference tasks when utilized with another binding. Furthermore, non-default bindings can outperform the default Python bindings in specific tasks without sacrificing accuracy. We also find significant differences in inference times across bindings, highlighting the benefits of choosing appropriate bindings based on specific performance requirements to maximize efficiency in ML projects.
The work presented in this thesis provides deep insights, actionable recommendations, and effective and thoroughly evaluated approaches for assessing and improving the quality of bindings for ML libraries in software package ecosystems.},
keywords = {Machine learning, Software Ecosystem, Software quality},
pubstate = {published},
tppubtype = {phdthesis}
}
Hao Li; Gopi Krishnan Rajbahadur; Cor-Paul Bezemer
Studying the Impact of TensorFlow and PyTorch Bindings on Machine Learning Software Quality Journal Article
ACM Transactions on Software Engineering and Methodology, 2024.
Abstract | BibTeX | Tags: Library bindings, Machine learning, SE4AI, SE4ML, Software quality
@article{Li_BindingsQuality,
title = {Studying the Impact of TensorFlow and PyTorch Bindings on Machine Learning Software Quality},
author = {Hao Li and Gopi Krishnan Rajbahadur and Cor-Paul Bezemer},
year = {2024},
date = {2024-07-07},
journal = {ACM Transactions on Software Engineering and Methodology},
abstract = {Bindings for machine learning frameworks (such as TensorFlow and PyTorch) allow developers to integrate a framework’s functionality using a programming language different from the framework’s default language (usually Python). In this paper, we study the impact of using TensorFlow and PyTorch bindings in C#, Rust, Python and JavaScript on the software quality in terms of correctness (training and test accuracy) and time cost (training and inference time) when training and performing inference on five widely used deep learning models. Our experiments show that a model can be trained in one binding and used for inference in another binding for the same framework without losing accuracy. Our study is the first to show that using a non-default binding can help improve machine learning software quality from the time cost perspective compared to the default Python binding while still achieving the same level of correctness.},
keywords = {Library bindings, Machine learning, SE4AI, SE4ML, Software quality},
pubstate = {published},
tppubtype = {article}
}
Hao Li; Gopi Krishnan Rajbahadur; Dayi Lin; Cor-Paul Bezemer; Zhen Ming (Jack) Jiang
Keeping Deep Learning Models in Check: A History-Based Approach to Mitigate Overfitting Journal Article
IEEE Access, 12, pp. 70676–70689, 2024.
Abstract | BibTeX | Tags: Machine learning, Overfitting
@article{Li_Overfitting,
title = {Keeping Deep Learning Models in Check: A History-Based Approach to Mitigate Overfitting},
author = {Hao Li and Gopi Krishnan Rajbahadur and Dayi Lin and Cor-Paul Bezemer and Zhen Ming (Jack) Jiang},
year = {2024},
date = {2024-05-17},
journal = {IEEE Access},
volume = {12},
pages = {70676--70689},
abstract = {In software engineering, deep learning models are increasingly deployed for critical tasks such as bug detection and code review. However, overfitting remains a challenge that affects the quality, reliability, and trustworthiness of software systems that utilize deep learning models. Overfitting can be (1) prevented (e.g., using dropout or early stopping) or (2) detected in a trained model (e.g., using correlation-based approaches). Both overfitting detection and prevention approaches that are currently used have constraints (e.g., requiring modification of the model structure, and high computing resources). In this paper, we propose a simple, yet powerful approach that can both detect and prevent overfitting based on the training history (i.e., validation losses). Our approach first trains a time series classifier on training histories of overfit models. This classifier is then used to detect if a trained model is overfit. In addition, our trained classifier can be used to prevent overfitting by identifying the optimal point to stop a model’s training. We evaluate our approach on its ability to identify and prevent overfitting in real-world samples. We compare our approach against correlation-based detection approaches and the most commonly used prevention approach (i.e., early stopping). Our approach achieves an F1 score of 0.91 which is at least 5% higher than the current best-performing non-intrusive overfitting detection approach. Furthermore, our approach can stop training to avoid overfitting at least 32% of the times earlier than early stopping and has the same or a better rate of returning the best model.},
keywords = {Machine learning, Overfitting},
pubstate = {published},
tppubtype = {article}
}
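For context on the prevention baseline named in this abstract, early stopping amounts to a patience-based check on the validation-loss history; the sketch below is that generic baseline, not the paper's history-based time series classifier, and the function name and thresholds are illustrative.

// Return true when the last `patience` validation losses show no improvement
// over the best loss observed before them (generic early stopping, for context).
function shouldStopEarly(valLosses: number[], patience = 5, minDelta = 0.0): boolean {
  if (valLosses.length <= patience) return false;
  const best = Math.min(...valLosses.slice(0, valLosses.length - patience));
  const recent = valLosses.slice(valLosses.length - patience);
  return recent.every((loss) => loss > best - minDelta);
}

// Example: the validation loss bottoms out around epoch 4 and then rises,
// so training would be stopped here.
console.log(shouldStopEarly([0.9, 0.7, 0.6, 0.55, 0.56, 0.58, 0.6, 0.61, 0.63]));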
Mikael Sabuhi; Petr Musilek; Cor-Paul Bezemer
Micro-FL: A Fault-Tolerant Scalable Microservice Based Platform for Federated Learning Journal Article
Future Internet, 16 (3), pp. 1-19, 2024.
Abstract | BibTeX | Tags: Federated learning, Machine learning, Microservices
@article{Sabuhi_MicroFL,
title = {Micro-FL: A Fault-Tolerant Scalable Microservice Based Platform for Federated Learning},
author = {Mikael Sabuhi and Petr Musilek and Cor-Paul Bezemer},
year = {2024},
date = {2024-02-19},
journal = {Future Internet},
volume = {16},
number = {3},
pages = {1-19},
abstract = {As the number of machine learning applications increases, growing concerns about data privacy expose the limitations of traditional cloud-based machine learning methods that rely on centralized data collection and processing. Federated learning emerges as a promising alternative, offering a novel approach to training machine learning models that safeguards data privacy. Federated learning facilitates collaborative model training across various entities. In this approach, each user trains models locally and shares only the local model parameters with a central server, which then generates a global model based on these individual updates. This approach ensures data privacy since the training data itself is never directly shared with a central entity. However, existing federated machine learning frameworks are not without challenges. In terms of server design, these frameworks exhibit limited scalability with an increasing number of clients and are highly vulnerable to system faults, particularly as the central server becomes a single point of failure. This paper introduces Micro-FL, a federated learning framework that uses a microservices architecture to implement the federated learning system. It demonstrates that the framework is fault-tolerant and scalable, showing its ability to handle an increasing number of clients. A comprehensive performance evaluation confirms that Micro-FL proficiently handles component faults, enabling a smooth and uninterrupted operation.},
keywords = {Federated learning, Machine learning, Microservices},
pubstate = {published},
tppubtype = {article}
}
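The aggregation step this abstract describes (clients train locally, share only parameters, and the server combines them into a global model) is, in its simplest form, federated averaging; the sketch below is a generic FedAvg over flat weight vectors, not the Micro-FL microservice implementation.

// Weighted average of client weight vectors, with each client weighted by the
// size of its local dataset (generic FedAvg, for illustration only).
function federatedAverage(clientWeights: number[][], clientSizes: number[]): number[] {
  const total = clientSizes.reduce((a, b) => a + b, 0);
  const globalWeights = new Array<number>(clientWeights[0].length).fill(0);
  clientWeights.forEach((weights, i) => {
    const share = clientSizes[i] / total;
    weights.forEach((w, j) => (globalWeights[j] += share * w));
  });
  return globalWeights;
}

// Example: two clients, where the second holds three times as much data.
console.log(federatedAverage([[0.2, 1.0], [0.4, 0.0]], [100, 300])); // [0.35, 0.25]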
Mohammad Reza Taesiri; Giang Nguyen; Sarra Habchi; Cor-Paul Bezemer; Anh Nguyen
ImageNet-Hard: The Hardest Images Remaining from a Study of the Power of Zoom and Spatial Biases in Image Classification Inproceedings
NeurIPS Datasets and Benchmarks Track, 2023.
BibTeX | Tags: Benchmark, Computer vision, Dataset, Image classification, Machine learning
@inproceedings{TaesiriNeurIPS2023,
title = {ImageNet-Hard: The Hardest Images Remaining from a Study of the Power of Zoom and Spatial Biases in Image Classification},
author = {Mohammad Reza Taesiri and Giang Nguyen and Sarra Habchi and Cor-Paul Bezemer and Anh Nguyen},
year = {2023},
date = {2023-12-07},
urldate = {2023-12-07},
booktitle = {NeurIPS Datasets and Benchmarks Track},
keywords = {Benchmark, Computer vision, Dataset, Image classification, Machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}