Lizhi Liao; Simon Eismann; Heng Li; Cor-Paul Bezemer; Diego Costa; André van Hoorn; Weiyi Shang
Early Detection of Performance Regressions by Bridging Local Performance Data and Architectural Models Inproceedings
International Conference on Software Engineering (ICSE), 2025.
BibTeX | Tags: Performance, Performance analysis, Performance engineering, Performance evaluation, Performance regressions, Performance testing
@inproceedings{Liao_ICSE2025,
title = {Early Detection of Performance Regressions by Bridging Local Performance Data and Architectural Models},
author = {Lizhi Liao and Simon Eismann and Heng Li and Cor-Paul Bezemer and Diego Costa and André van Hoorn and Weiyi Shang},
year = {2025},
date = {2025-08-15},
urldate = {2025-08-15},
booktitle = {International Conference on Software Engineering (ICSE)},
keywords = {Performance, Performance analysis, Performance engineering, Performance evaluation, Performance regressions, Performance testing},
pubstate = {published},
tppubtype = {inproceedings}
}
Hao Li; Cor-Paul Bezemer; Ahmed E. Hassan
Software Engineering and Foundation Models: Insights from Industry Blogs Using a Jury of Foundation Models Inproceedings
International Conference on Software Engineering - Software Engineering in Practice (ICSE - SEIP) Track, 2025.
Abstract | BibTeX | Tags: FM4SE, Foundation models, SE4AI, SE4FM, SE4ML
@inproceedings{Li_SEFM_blogs,
title = {Software Engineering and Foundation Models: Insights from Industry Blogs Using a Jury of Foundation Models},
author = {Hao Li and Cor-Paul Bezemer and Ahmed E. Hassan},
year = {2025},
date = {2025-04-27},
booktitle = {International Conference on Software Engineering - Software Engineering in Practice (ICSE - SEIP) Track},
abstract = {Foundation models (FMs) such as large language models (LLMs) have significantly impacted many fields, including software engineering (SE). The interaction between SE and FMs has led to the integration of FMs into SE practices (FM4SE) and the application of SE methodologies to FMs (SE4FM). While several literature surveys exist on academic contributions to these trends, we are the first to provide a practitioner’s view. We analyze 155 FM4SE and 997 SE4FM blog posts from leading technology companies, leveraging an FM-powered surveying approach to systematically label and summarize the discussed activities and tasks. We observed that while code generation is the most prominent FM4SE task, FMs are leveraged for many other SE activities such as code understanding, summarization, and API recommendation. The majority of blog posts on SE4FM are about model deployment & operation, and system architecture & orchestration. Although the emphasis is on cloud deployments, there is a growing interest in compressing FMs and deploying them on smaller devices such as edge or mobile devices. We outline eight future research directions inspired by our gained insights, aiming to bridge the gap between academic findings and real-world applications. Our study not only enriches the body of knowledge on practical applications of FM4SE and SE4FM but also demonstrates the utility of FMs as a powerful and efficient approach in conducting literature surveys within technical and grey literature domains. Our dataset, results, code and used prompts can be found in our online replication package at https://zenodo.org/records/14563992.},
keywords = {FM4SE, Foundation models, SE4AI, SE4FM, SE4ML},
pubstate = {published},
tppubtype = {inproceedings}
}
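The paper's FM-powered surveying approach is not reproduced here, but the core idea of letting a "jury" of models vote on a label can be sketched in a few lines. The sketch below is a minimal illustration assuming a placeholder query_model function (hypothetical; any FM client could be substituted) and simple majority voting over the returned labels.

```python
from collections import Counter

def query_model(model_name: str, prompt: str) -> str:
    """Placeholder for a call to a foundation model API (hypothetical).
    In practice this would send the prompt to `model_name` and return its label."""
    raise NotImplementedError("plug in your FM client here")

def jury_label(blog_post: str, candidate_labels: list[str], jury: list[str]) -> str:
    """Ask each jury model to pick one label, then return the majority vote."""
    prompt = (
        "Label the following blog post with exactly one of these SE activities: "
        + ", ".join(candidate_labels) + "\n\n" + blog_post
    )
    votes = [query_model(model, prompt) for model in jury]
    # Majority vote; ties are broken by whichever label was counted first.
    return Counter(votes).most_common(1)[0][0]
```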
Tajkia Rahman Toma; Balreet Grewal; Cor-Paul Bezemer
Answering User Questions about Machine Learning Models through Standardized Model Cards Inproceedings
International Conference on Software Engineering (ICSE), 2025.
Abstract | BibTeX | Tags: Hugging Face, Q&A communities, Q&A websites, SE4AI, SE4FM, SE4ML
@inproceedings{Toma_UserQuestions,
title = {Answering User Questions about Machine Learning Models through Standardized Model Cards},
author = {Tajkia Rahman Toma and Balreet Grewal and Cor-Paul Bezemer },
year = {2025},
date = {2025-04-27},
booktitle = {International Conference on Software Engineering (ICSE)},
abstract = {Reusing pre-trained machine learning models is becoming very popular due to model hubs such as Hugging Face (HF). However, similar to when reusing software, many issues may arise when reusing an ML model. In many cases, users resort to asking questions on discussion forums such as the HF community forum. In this paper, we study how we can reduce the community’s workload in answering these questions and increase the likelihood that questions receive a quick answer. We analyze 11,278 discussions from the HF model community that contain user questions about ML models. We focus on the effort spent handling questions, the high-level topics of discussions, and the potential for standardizing responses in model cards based on a model card template. Our findings indicate that there is not much effort involved in responding to user questions, however, 40.1% of the questions remain open without any response. A topic analysis shows that discussions are more centered around technical details on model development and troubleshooting, indicating that more input from model providers is required. We show that 42.5% of the questions could have been answered if the model provider followed a standard model card template for the model card. Based on our analysis, we recommend that model providers add more development-related details on the model’s architecture, algorithm, data preprocessing and training code in existing documentation (sub)sections and add new (sub)sections to the template to address common questions about model usage and hardware requirements.},
keywords = {Hugging Face, Q&A communities, Q&A websites, SE4AI, SE4FM, SE4ML},
pubstate = {published},
tppubtype = {inproceedings}
}
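As a rough illustration of how a model provider could check whether a model card covers the kinds of sections a standard template asks for, the sketch below scans a Markdown model card for a handful of headings. The section names are illustrative assumptions, not the template used in the paper.

```python
import re

# Illustrative section names only; the actual template in the paper may differ.
EXPECTED_SECTIONS = [
    "Model Description", "Training Data", "Training Procedure",
    "Evaluation", "How to Use", "Limitations", "Hardware Requirements",
]

def missing_sections(model_card_markdown: str) -> list[str]:
    """Return expected sections that do not appear as a Markdown heading."""
    headings = {
        h.strip().lower()
        for h in re.findall(r"^#{1,6}\s+(.*)$", model_card_markdown, re.MULTILINE)
    }
    return [s for s in EXPECTED_SECTIONS if s.lower() not in headings]

card = "# My Model\n## Model Description\n...\n## Evaluation\n..."
print(missing_sections(card))  # e.g. ['Training Data', 'Training Procedure', ...]
```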
Mohammad Reza Taesiri; Cor-Paul Bezemer
VIDEOGAMEBUNNY: Towards vision assistants for video games Inproceedings
Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, 2025.
Abstract | BibTeX | Tags: Computer games, Foundation models, Game development, Game testing
@inproceedings{Taesiri_VideoGameBunny,
title = {VIDEOGAMEBUNNY: Towards vision assistants for video games},
author = {Mohammad Reza Taesiri and Cor-Paul Bezemer },
year = {2025},
date = {2025-03-01},
booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision},
abstract = {Large multimodal models (LMMs) hold substantial promise across various domains, from personal assistance in daily tasks to sophisticated applications like medical diagnostics. However, their capabilities have limitations in the video game domain, such as challenges with scene understanding, hallucinations, and inaccurate descriptions of video game content, especially in open-source models. This paper describes the development of VIDEOGAMEBUNNY, a LLaVA-style model based on Bunny, specifically tailored for understanding images from video games. We release intermediate checkpoints, training logs, and an extensive dataset comprising 185,259 video game images from 413 titles, along with 389,565 image-instruction pairs that include image captions, question-answer pairs, and a JSON representation of 16 elements of 136,974 images. Our experiments show that our high quality game-related data has the potential to make a relatively small model outperform the much larger state-of-the-art model LLaVa-1.6-34b (which has more than 4x the number of parameters). Our study paves the way for future research in video game understanding on tasks such as playing, commentary, and debugging. Code and data are available at: https://videogamebunny.github.io/},
keywords = {Computer games, Foundation models, Game development, Game testing},
pubstate = {published},
tppubtype = {inproceedings}
}
Hao Li; Cor-Paul Bezemer
Bridging the language gap: an empirical study of bindings for open source machine learning libraries across software package ecosystems Journal Article
Empirical Software Engineering, 30 (6), 2024.
Abstract | BibTeX | Tags: Library bindings, Machine learning, SE4AI, SE4ML
@article{li_MLbindings,
title = {Bridging the language gap: an empirical study of bindings for open source machine learning libraries across software package ecosystems},
author = {Hao Li and Cor-Paul Bezemer},
year = {2024},
date = {2024-10-18},
urldate = {2024-10-18},
journal = {Empirical Software Engineering},
volume = {30},
number = {6},
abstract = {Open source machine learning (ML) libraries enable developers to integrate advanced ML functionality into their own applications. However, popular ML libraries, such as TensorFlow, are not available natively in all programming languages and software package ecosystems. Hence, developers who wish to use an ML library which is not available in their programming language or ecosystem of choice, may need to resort to using a so-called binding library (or binding). Bindings provide support across programming languages and package ecosystems for reusing a host library. For example, the Keras .NET binding provides support for the Keras library in the NuGet (.NET) ecosystem even though the Keras library was written in Python. In this paper, we collect 2,436 cross-ecosystem bindings for 546 ML libraries across 13 software package ecosystems by using an approach called BindFind, which can automatically identify bindings and link them to their host libraries. Furthermore, we conduct an in-depth study of 133 cross-ecosystem bindings and their development for 40 popular open source ML libraries. Our findings reveal that the majority of ML library bindings are maintained by the community, with npm being the most popular ecosystem for these bindings. Our study also indicates that most bindings cover only a limited range of the host library’s releases, often experience considerable delays in supporting new releases, and have widespread technical lag. Our findings highlight key factors to consider for developers integrating bindings for ML libraries and open avenues for researchers to further investigate bindings in software package ecosystems.},
keywords = {Library bindings, Machine learning, SE4AI, SE4ML},
pubstate = {published},
tppubtype = {article}
}
Mohammad Reza Taesiri
Leveraging Foundation Models for Video Game Quality Assurance PhD Thesis
2024.
Abstract | BibTeX | Tags: Computer games, Computer vision, Game development, Game testing, Gameplay videos, Machine learning, Software quality
@phdthesis{phd_taesiri,
title = {Leveraging Foundation Models for Video Game Quality Assurance},
author = {Mohammad Reza Taesiri },
year = {2024},
date = {2024-09-25},
abstract = {The video game industry has become a powerhouse in the global entertainment economy. Creating engaging, high-quality games demands intricate development processes and significant resources. As projects grow in complexity and scale, developers often grapple with demanding schedules, tight deadlines, and the risk of burnout. These pressures highlight the need for more efficient development strategies, with quality assurance (QA) emerging as a critical area for optimization.
Artificial Intelligence (AI) has the potential to address these challenges by enhancing the game QA processes in large gaming companies. Specifically, foundation models—large pre-trained AI models—offer promising applications to improve these processes. Exploring novel uses of these advanced AI models could reveal their potential and limitations in optimizing game development workflows, potentially alleviating some of the industry’s pressing issues and facilitating the creation of high-quality, engaging games.
In this thesis, my goal is to improve video game testing processes by leveraging foundation models to ensure the final product reaches a desirable quality. I explore new opportunities that foundation models bring to game testing, from searching for instances of game bugs within video repositories to assisting human testers in catching bugs, through three studies:
First, I investigate the utility of image-text foundation models in retrieving gameplay videos. In this study, I create a video search engine designed to help developers efficiently search video repositories for examples of video game bugs using textual descriptions. For example, developers can find all instances of a bug by using a textual description of the bug, such as a horse flying in the air. This study lays the groundwork for AI-based game QA processes, with results demonstrating significant potential.
Next, I introduce GlitchBench, a benchmarking dataset of video game glitches and anomalies designed to assess state-of-the-art large multimodal models, such as GPT-4V, in detecting and understanding game bugs. This extensive dataset includes a wide range of images depicting various glitches, sourced from both online platforms and synthetic sets created within the Unity game engine. GlitchBench includes both common and rare glitches encountered in the video game quality assurance process. The findings from this study highlight both the promise and limitations of existing models, particularly in unusual and rare cases.
Lastly, I introduce VideoGameBunny, a large multimodal model specifically trained for video game content, accompanied by a dataset of 389,565 image-instruction pairs. My analysis demonstrates that VideoGameBunny outperforms much larger models in video game understanding tasks while using 4.2× fewer parameters. This result underscores the effectiveness and promise of using a high-quality dataset to improve models’ understanding of video games, thus making them more effective in the game QA process.
Future work should focus on enhancing the generalization and robustness of AI models in the gaming context, particularly through better integration of vision and language components. This integration could be achieved using either early or late fusion methods. For late fusion methods, where two pre-trained models are connected, better alignment between these components can be achieved through improved training data and strategies. Alternatively, early fusion techniques, which involve training both components simultaneously to enhance their integration, can overcome many issues that existing models have.},
keywords = {Computer games, Computer vision, Game development, Game testing, Gameplay videos, Machine learning, Software quality},
pubstate = {published},
tppubtype = {phdthesis}
}
Hao Li
Investigating the Quality of Bindings for Machine Learning Libraries in Software Package Ecosystems PhD Thesis
2024.
Abstract | BibTeX | Tags: Machine learning, Software Ecosystem, Software quality
@phdthesis{phd_haoli,
title = {Investigating the Quality of Bindings for Machine Learning Libraries in Software Package Ecosystems},
author = {Hao Li },
year = {2024},
date = {2024-08-21},
urldate = {2024-08-21},
abstract = {Machine learning (ML) has revolutionized many domains, with developers often relying on open source ML libraries to integrate ML capabilities into their projects. However, these libraries primarily support a single programming language, limiting their availability for projects in other languages. Bindings serve as bridges between programming languages by providing interfaces to ML libraries. This thesis investigates the quality of bindings for ML libraries in software package ecosystems, focusing on their maintenance and software quality.
The first study presented in this thesis introduces BindFind, an automated approach to identify bindings and link them with their corresponding host libraries across various software package ecosystems. By analyzing 2,436 bindings for 546 ML libraries, we find that most bindings are community-maintained, with npm being the most popular choice for publishing these bindings. The analysis reveals that these bindings usually cover a limited range of releases from their host library and experience significant delays in supporting new releases.
In the second study, we investigate the usage and rationale behind release-level deprecation in bindings for ML libraries within the Cargo and npm ecosystems. We discover that bindings in Cargo have a higher percentage of deprecated releases compared to general packages, while the percentages of deprecated releases and general packages are similar in npm. The primary reasons for deprecation are package removal or replacement and defects in both ecosystems. We also identify the issue of implicitly deprecated releases in Cargo due to deprecation propagation through the dependency network.
The third study evaluates the impact of using different bindings on the software quality of ML systems through experiments on model training and inference using TensorFlow and PyTorch across four programming languages. The results show that models trained with one binding perform consistently in inference tasks when utilized with another binding. Furthermore, non-default bindings can outperform the default Python bindings in specific tasks without sacrificing accuracy. We also find significant differences in inference times across bindings, highlighting the benefits of choosing appropriate bindings based on specific performance requirements to maximize efficiency in ML projects.
The work presented in this thesis provides deep insights, actionable recommendations, and effective and thoroughly evaluated approaches for assessing and improving the quality of bindings for ML libraries in software package ecosystems.},
keywords = {Machine learning, Software Ecosystem, Software quality},
pubstate = {published},
tppubtype = {phdthesis}
}
Ian Gauk; Cor-Paul Bezemer
Detecting Discrepancies between Subtitles and Audio in Gameplay Videos with EchoTest Journal Article
IEEE Transactions on Games, 2024.
Abstract | BibTeX | Tags: Accessibility testing, Computer games, Game accessibility, Game development, Game testing
@article{Gauk_EchoTest,
title = {Detecting Discrepancies between Subtitles and Audio in Gameplay Videos with EchoTest},
author = {Ian Gauk and Cor-Paul Bezemer},
year = {2024},
date = {2024-07-30},
journal = {IEEE Transactions on Games},
abstract = {The landscape of accessibility features in video games remains inconsistent, posing challenges for gamers who seek experiences tailored to their needs. Accessibility features such as subtitles are widely used by players but are difficult to test manually due to the large scope of games and the variability in how subtitles can appear.
In this paper, we introduce an automated approach (ECHOTEST) to extract subtitles and spoken audio from a gameplay video, convert them into text, and compare them to detect discrepancies such as typos, desynchronization and missing text. ECHOTEST can be used by game developers to identify discrepancies between subtitles and spoken audio in their games, enabling them to better test the accessibility of their games.
In an empirical study on gameplay videos from 15 popular games, ECHOTEST can verify discrepancies between subtitles and audio with a precision of 98% and a recall of 89%. In addition, ECHOTEST performs well with a precision of 73% and a recall of 99% on a challenging generated benchmark.},
keywords = {Accessibility testing, Computer games, Game accessibility, Game development, Game testing},
pubstate = {published},
tppubtype = {article}
}
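ECHOTEST itself combines extraction of on-screen subtitles with speech-to-text, which is beyond a short snippet; the sketch below only illustrates the final comparison step, flagging differences between a subtitle line and the transcribed audio for the same time window using Python's standard difflib. The threshold and example inputs are illustrative assumptions, not the tool's actual rules.

```python
import difflib

def find_discrepancy(subtitle: str, transcript: str, threshold: float = 0.9):
    """Compare a subtitle line with the transcribed audio for the same time window.
    Returns None if they match closely enough, otherwise a short diff report."""
    ratio = difflib.SequenceMatcher(None, subtitle.lower(), transcript.lower()).ratio()
    if ratio >= threshold:
        return None
    diff = "\n".join(difflib.ndiff(subtitle.split(), transcript.split()))
    return f"similarity={ratio:.2f}\n{diff}"

# Example: a typo in the subtitle ("recieved") shows up as a discrepancy.
report = find_discrepancy("You recieved a new quest", "You received a new quest")
if report:
    print(report)
```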
Hao Li; Gopi Krishnan Rajbahadur; Cor-Paul Bezemer
Studying the Impact of TensorFlow and PyTorch Bindings on Machine Learning Software Quality Journal Article
ACM Transactions on Software Engineering and Methodology, 2024.
Abstract | BibTeX | Tags: Library bindings, Machine learning, SE4AI, SE4ML, Software quality
@article{Li_BindingsQuality,
title = {Studying the Impact of TensorFlow and PyTorch Bindings on Machine Learning Software Quality},
author = {Hao Li and Gopi Krishnan Rajbahadur and Cor-Paul Bezemer},
year = {2024},
date = {2024-07-07},
journal = {ACM Transactions on Software Engineering and Methodology},
abstract = {Bindings for machine learning frameworks (such as TensorFlow and PyTorch) allow developers to integrate a framework’s functionality using a programming language different from the framework’s default language (usually Python). In this paper, we study the impact of using TensorFlow and PyTorch bindings in C#, Rust, Python and JavaScript on the software quality in terms of correctness (training and test accuracy) and time cost (training and inference time) when training and performing inference on five widely used deep learning models. Our experiments show that a model can be trained in one binding and used for inference in another binding for the same framework without losing accuracy. Our study is the first to show that using a non-default binding can help improve machine learning software quality from the time cost perspective compared to the default Python binding while still achieving the same level of correctness.},
keywords = {Library bindings, Machine learning, SE4AI, SE4ML, Software quality},
pubstate = {published},
tppubtype = {article}
}
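The finding that a model trained through one binding can be used for inference through another rests on the frameworks' portable model formats. The snippet below is a minimal Python-side sketch: it trains a tiny Keras model and exports it as a TensorFlow SavedModel, the language-neutral format that TensorFlow bindings in other languages can typically load. The data, architecture, and output path are illustrative only, not the models used in the paper.

```python
import numpy as np
import tensorflow as tf

# Tiny illustrative model and synthetic data (not the models used in the paper).
x = np.random.rand(256, 8).astype("float32")
y = (x.sum(axis=1) > 4.0).astype("float32")

model = tf.keras.Sequential([
    tf.keras.Input(shape=(8,)),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(x, y, epochs=3, verbose=0)

# SavedModel is the portable format that non-Python TensorFlow bindings can load.
tf.saved_model.save(model, "saved_model_dir")
```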
Mohammad Reza Taesiri; Tianjun Feng; Anh Nguyen; Cor-Paul Bezemer
GlitchBench: Can large multimodal models detect video game glitches? Inproceedings
IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024.
Abstract | BibTeX | Tags: Computer games, Foundation models, Game development, Gameplay videos, LLM
@inproceedings{TaesiriCVPR2024,
title = {GlitchBench: Can large multimodal models detect video game glitches?},
author = {Mohammad Reza Taesiri and Tianjun Feng and Anh Nguyen and Cor-Paul Bezemer},
year = {2024},
date = {2024-06-15},
urldate = {2024-03-15},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
abstract = {Large multimodal models (LMMs) have evolved from large language models (LLMs) to integrate multiple input modalities, such as visual inputs. This integration augments the capacity of LLMs for tasks requiring visual comprehension and reasoning. However, the extent and limitations of their enhanced abilities are not fully understood, especially when it comes to real-world tasks. To address this gap, we introduce GlitchBench, a novel benchmark derived from video game quality assurance tasks, to test and evaluate the reasoning capabilities of LMMs. Our benchmark is curated from a variety of unusual and glitched scenarios from video games and aims to challenge both the visual and linguistic reasoning powers of LMMs in detecting and interpreting out-of-the-ordinary events. We evaluate multiple state-of-the-art LMMs, and we show that GlitchBench presents a new challenge for these models. Code and data are available at: https://glitchbench.github.io/},
keywords = {Computer games, Foundation models, Game development, Gameplay videos, LLM},
pubstate = {published},
tppubtype = {inproceedings}
}
Hao Li; Gopi Krishnan Rajbahadur; Dayi Lin; Cor-Paul Bezemer; Zhen Ming (Jack) Jiang
Keeping Deep Learning Models in Check: A History-Based Approach to Mitigate Overfitting Journal Article
IEEE Access, 12, pp. 70676–70689, 2024.
Abstract | BibTeX | Tags: Machine learning, Overfitting
@article{Li_Overfitting,
title = {Keeping Deep Learning Models in Check: A History-Based Approach to Mitigate Overfitting},
author = {Hao Li and Gopi Krishnan Rajbahadur and Dayi Lin and Cor-Paul Bezemer and Zhen Ming (Jack) Jiang},
year = {2024},
date = {2024-05-17},
journal = {IEEE Access},
volume = {12},
pages = {70676--70689},
abstract = {In software engineering, deep learning models are increasingly deployed for critical tasks such as bug detection and code review. However, overfitting remains a challenge that affects the quality, reliability, and trustworthiness of software systems that utilize deep learning models. Overfitting can be (1) prevented (e.g., using dropout or early stopping) or (2) detected in a trained model (e.g., using correlation-based approaches). Both overfitting detection and prevention approaches that are currently used have constraints (e.g., requiring modification of the model structure, and high computing resources). In this paper, we propose a simple, yet powerful approach that can both detect and prevent overfitting based on the training history (i.e., validation losses). Our approach first trains a time series classifier on training histories of overfit models. This classifier is then used to detect if a trained model is overfit. In addition, our trained classifier can be used to prevent overfitting by identifying the optimal point to stop a model’s training. We evaluate our approach on its ability to identify and prevent overfitting in real-world samples. We compare our approach against correlation-based detection approaches and the most commonly used prevention approach (i.e., early stopping). Our approach achieves an F1 score of 0.91 which is at least 5% higher than the current best-performing non-intrusive overfitting detection approach. Furthermore, our approach can stop training to avoid overfitting at least 32% of the times earlier than early stopping and has the same or a better rate of returning the best model.},
keywords = {Machine learning, Overfitting},
pubstate = {published},
tppubtype = {article}
}
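The paper trains a time-series classifier on validation-loss histories; its exact featurization and classifier are not reproduced here. As a rough sketch of the idea, the snippet below derives two simple features from each validation-loss curve (how far the final loss has rebounded above its minimum, and where the minimum occurred) and fits a scikit-learn classifier on curves labeled overfit or not. The features, toy labels, and classifier choice are illustrative assumptions.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def features(val_losses: list[float]) -> list[float]:
    """Two toy features of a validation-loss history: rebound above the minimum
    and the relative position of the minimum in the curve."""
    losses = np.asarray(val_losses, dtype=float)
    rebound = float(losses[-1] - losses.min())
    min_pos = float(np.argmin(losses) / (len(losses) - 1))
    return [rebound, min_pos]

# Toy training histories: overfit curves rebound after an early minimum.
histories = [
    ([0.9, 0.6, 0.4, 0.35, 0.45, 0.6], 1),   # overfit
    ([0.9, 0.7, 0.55, 0.5, 0.48, 0.47], 0),  # not overfit
    ([1.0, 0.5, 0.3, 0.5, 0.8, 1.1], 1),     # overfit
    ([1.1, 0.8, 0.6, 0.5, 0.45, 0.44], 0),   # not overfit
]
X = [features(h) for h, _ in histories]
y = [label for _, label in histories]

clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
print(clf.predict([features([0.8, 0.5, 0.4, 0.55, 0.7, 0.9])]))  # likely [1]
```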
Balreet Grewal; Wentao Lu; Sarah Nadi; Cor-Paul Bezemer
Analyzing Developer Use of ChatGPT Generated Code in Open Source GitHub Projects Inproceedings
International Conference on Mining Software Repositories (MSR), 2024.
Abstract | BibTeX | Tags: Code reuse, LLM, SE4AI
@inproceedings{GrewalMSR2024,
title = {Analyzing Developer Use of ChatGPT Generated Code in Open Source GitHub Projects},
author = {Balreet Grewal and Wentao Lu and Sarah Nadi and Cor-Paul Bezemer },
year = {2024},
date = {2024-04-14},
urldate = {2024-04-14},
booktitle = {International Conference on Mining Software Repositories (MSR)},
abstract = {The rapid development of large language models such as ChatGPT have made them particularly useful to developers in generating code snippets for their projects. To understand how ChatGPT’s generated code is leveraged by developers, we conducted an empirical study of 3,044 ChatGPT-generated code snippets integrated within GitHub projects. A median of 54% of the generated lines of code is found in the project’s code and this code typically remains unchanged once added. The modifications of the 76 code snippets that changed in a subsequent commit, consisted of minor functionality changes and code reorganizations that were made within a day. Our findings offer insights that help drive the development of AI-assisted programming tools. We highlight the importance of making changes in ChatGPT code before integrating it into a project.},
keywords = {Code reuse, LLM, SE4AI},
pubstate = {published},
tppubtype = {inproceedings}
}
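The study reports, per snippet, how many of ChatGPT's generated lines appear in the project's code. A minimal sketch of that kind of measurement is below; it normalizes whitespace and counts which non-empty generated lines occur in a project file, which is an assumed matching rule rather than the paper's exact method.

```python
def fraction_of_lines_found(generated_code: str, project_code: str) -> float:
    """Share of non-empty generated lines that also appear in the project code,
    after stripping leading/trailing whitespace (illustrative matching rule)."""
    gen_lines = [l.strip() for l in generated_code.splitlines() if l.strip()]
    project_lines = {l.strip() for l in project_code.splitlines() if l.strip()}
    if not gen_lines:
        return 0.0
    found = sum(1 for l in gen_lines if l in project_lines)
    return found / len(gen_lines)

snippet = "def add(a, b):\n    return a + b\n"
project = "import math\n\ndef add(a, b):\n    return a + b\n"
print(f"{fraction_of_lines_found(snippet, project):.0%}")  # 100%
```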
Mikael Sabuhi; Petr Musilek; Cor-Paul Bezemer
Micro-FL: A Fault-Tolerant Scalable Microservice Based Platform for Federated Learning Journal Article
Future Internet, 16 (3), pp. 1-19, 2024.
Abstract | BibTeX | Tags: Federated learning, Machine learning, Microservices
@article{Sabuhi_MicroFL,
title = {Micro-FL: A Fault-Tolerant Scalable Microservice Based Platform for Federated Learning},
author = {Mikael Sabuhi and Petr Musilek and Cor-Paul Bezemer },
year = {2024},
date = {2024-02-19},
journal = {Future Internet},
volume = {16},
number = {3},
pages = {1-19},
abstract = {As the number of machine learning applications increases, growing concerns about data privacy expose the limitations of traditional cloud-based machine learning methods that rely on centralized data collection and processing. Federated learning emerges as a promising alternative, offering a novel approach to training machine learning models that safeguards data privacy. Federated learning facilitates collaborative model training across various entities. In this approach, each user trains models locally and shares only the local model parameters with a central server, which then generates a global model based on these individual updates. This approach ensures data privacy since the training data itself is never directly shared with a central entity. However, existing federated machine learning frameworks are not without challenges. In terms of server design, these frameworks exhibit limited scalability with an increasing number of clients and are highly vulnerable to system faults, particularly as the central server becomes a single point of failure. This paper introduces Micro-FL, a federated learning framework that uses a microservices architecture to implement the federated learning system. It demonstrates that the framework is fault-tolerant and scalable, showing its ability to handle an increasing number of clients. A comprehensive performance evaluation confirms that Micro-FL proficiently handles component faults, enabling a smooth and uninterrupted operation.},
keywords = {Federated learning, Machine learning, Microservices},
pubstate = {published},
tppubtype = {article}
}
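Micro-FL's microservices architecture is not reproduced here, but the aggregation step its central service performs can be sketched with standard federated averaging (FedAvg): a weighted average of client model parameters by client dataset size. The snippet assumes clients ship their parameters as NumPy arrays; it is a sketch of the general technique, not Micro-FL's implementation.

```python
import numpy as np

def federated_average(client_weights: list[list[np.ndarray]],
                      client_sizes: list[int]) -> list[np.ndarray]:
    """FedAvg: weight each client's parameters by its number of training samples."""
    total = sum(client_sizes)
    num_layers = len(client_weights[0])
    global_weights = []
    for layer in range(num_layers):
        layer_sum = sum(w[layer] * (n / total)
                        for w, n in zip(client_weights, client_sizes))
        global_weights.append(layer_sum)
    return global_weights

# Two toy clients with a single 2x2 weight matrix each.
clients = [[np.ones((2, 2))], [np.zeros((2, 2))]]
print(federated_average(clients, client_sizes=[30, 10])[0])  # all entries 0.75
```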
Tajkia Rahman Toma; Cor-Paul Bezemer
An Exploratory Study of Dataset and Model Management in Open Source Machine Learning Applications Inproceedings
3rd IEEE/ACM International Conference on AI Engineering - Software Engineering for AI (CAIN), pp. 1–11, 2024.
Abstract | BibTeX | Tags: Data maintenance, SE4ML
@inproceedings{TomaCAIN2024,
title = {An Exploratory Study of Dataset and Model Management in Open Source Machine Learning Applications},
author = {Tajkia Rahman Toma and Cor-Paul Bezemer},
year = {2024},
date = {2024-01-17},
urldate = {2024-01-17},
booktitle = {3rd IEEE/ACM International Conference on AI Engineering - Software Engineering for AI (CAIN)},
pages = {1--11},
abstract = {Datasets and models are two key artifacts in machine learning (ML) applications. Although there exist tools to support dataset and model developers in managing ML artifacts, little is known about how these datasets and models are integrated into ML applications. In this paper, we study how datasets and models in ML applications are managed. In particular, we focus on how these artifacts are stored and versioned alongside the applications. After analyzing 93 repositories, we identified the most common storage location to store datasets and models is the file system, which causes availability issues. Notably, large data and model files, exceeding approximately 60 MB, are stored exclusively in remote storage and downloaded as needed. Most of the datasets and models lack proper integration with the version control system, posing potential traceability and reproducibility issues. Additionally, although datasets and models are likely to evolve during the application development, they are rarely updated in application repositories.},
keywords = {Data maintenance, SE4ML},
pubstate = {published},
tppubtype = {inproceedings}
}
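One practical takeaway from the study is that data and model files above roughly 60 MB tend to live outside the repository. A small sketch along those lines is below: it walks a repository and lists dataset or model files over a size threshold as candidates for remote storage or a data-versioning tool. The extension list and threshold are illustrative assumptions inspired by the study's observation.

```python
import os

# Illustrative extensions and threshold, inspired by the ~60 MB observation.
ARTIFACT_EXTENSIONS = {".csv", ".parquet", ".npz", ".h5", ".pt", ".pkl", ".onnx"}
THRESHOLD_BYTES = 60 * 1024 * 1024

def oversized_artifacts(repo_path: str):
    """Yield (path, size in MB) for dataset/model files above the threshold."""
    for root, _dirs, files in os.walk(repo_path):
        for name in files:
            if os.path.splitext(name)[1].lower() in ARTIFACT_EXTENSIONS:
                path = os.path.join(root, name)
                size = os.path.getsize(path)
                if size > THRESHOLD_BYTES:
                    yield path, size / (1024 * 1024)

for path, mb in oversized_artifacts("."):
    print(f"{path}: {mb:.1f} MB -- consider remote storage or a data-versioning tool")
```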
Mohammad Reza Taesiri; Finlay Macklon; Sarra Habchi; Cor-Paul Bezemer
Searching bug instances in gameplay video repositories Journal Article
IEEE Transactions on Games, 2024.
Abstract | BibTeX | Tags: Bug report, Computer games, Game development, Gameplay videos, Gaming
@article{TaesiriTG2024,
title = {Searching bug instances in gameplay video repositories},
author = {Mohammad Reza Taesiri and Finlay Macklon and Sarra Habchi and Cor-Paul Bezemer},
year = {2024},
date = {2024-01-17},
urldate = {2024-01-17},
journal = {IEEE Transactions on Games},
abstract = {Gameplay videos offer valuable insights into player interactions and game responses, particularly data about game bugs. Despite the abundance of gameplay videos online, extracting useful information remains a challenge. This paper introduces a method for searching and extracting relevant videos from extensive video repositories using English text queries. Our approach requires no external information, like video metadata; it solely depends on video content. Leveraging the zero-shot transfer capabilities of the Contrastive Language-Image Pre-Training (CLIP) model, our approach does not require any data labeling or training. To evaluate our approach, we present the GamePhysics dataset, comprising 26,954 videos from 1,873 games that were collected from the GamePhysics section on the Reddit website. Our approach shows promising results in our extensive analysis of simple and compound queries, indicating that our method is useful for detecting objects and events in gameplay videos. Moreover, we assess the effectiveness of our method by analyzing a carefully annotated dataset of 220 gameplay videos. The results of our study demonstrate the potential of our approach for applications such as the creation of a video search tool tailored to identifying video game bugs, which could greatly benefit Quality Assurance (QA) teams in finding and reproducing bugs. The code and data used in this paper can be found at https://zenodo.org/records/10211390},
keywords = {Bug report, Computer games, Game development, Gameplay videos, Gaming},
pubstate = {published},
tppubtype = {article}
}
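The approach relies on CLIP's zero-shot image-text matching; the sketch below shows only the core ranking step with the Hugging Face transformers implementation of CLIP, scoring a handful of video frames against an English query and sorting by similarity. Frame extraction, the GamePhysics dataset, and the full search pipeline are out of scope; the model checkpoint and file paths are illustrative assumptions.

```python
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

frame_paths = ["frame_001.png", "frame_002.png", "frame_003.png"]  # illustrative
images = [Image.open(p) for p in frame_paths]
query = "a horse flying in the air"

inputs = processor(text=[query], images=images, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# logits_per_image holds the image-text similarity score for each frame.
scores = outputs.logits_per_image.squeeze(-1)
ranking = sorted(zip(frame_paths, scores.tolist()), key=lambda x: x[1], reverse=True)
for path, score in ranking:
    print(f"{score:.2f}  {path}")
```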
Mohammad Reza Taesiri; Giang Nguyen; Sarra Habchi; Cor-Paul Bezemer; Anh Nguyen
ImageNet-Hard: The Hardest Images Remaining from a Study of the Power of Zoom and Spatial Biases in Image Classification Inproceedings
NeurIPS Dataset and Benchmark track, 2023.
BibTeX | Tags: Benchmark, Computer vision, Dataset, Image classification, Machine learning
@inproceedings{TaesiriNeurIPS2023,
title = {ImageNet-Hard: The Hardest Images Remaining from a Study of the Power of Zoom and Spatial Biases in Image Classification},
author = {Mohammad Reza Taesiri and Giang Nguyen and Sarra Habchi and Cor-Paul Bezemer and Anh Nguyen},
year = {2023},
date = {2023-12-07},
urldate = {2023-12-07},
booktitle = {NeurIPS Dataset and Benchmark track},
keywords = {Benchmark, Computer vision, Dataset, Image classification, Machine learning},
pubstate = {published},
tppubtype = {inproceedings}
}
Markos Viggiato; Dale Paas; Cor-Paul Bezemer
Prioritizing Natural Language Test Cases Based on Highly-Used Game Features Inproceedings
Proceedings of the 31st Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE), pp. 1–12, 2023.
Abstract | BibTeX | Tags: Computer games, Game development, Natural language processing, Testing
@inproceedings{ViggiatoFSE2023,
title = {Prioritizing Natural Language Test Cases Based on Highly-Used Game Features},
author = {Markos Viggiato and Dale Paas and Cor-Paul Bezemer },
year = {2023},
date = {2023-12-01},
urldate = {2023-12-01},
booktitle = {Proceedings of the 31st Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE)},
pages = {1--12},
abstract = {Software testing is still a manual activity in many industries, such as the gaming industry. But manually executing tests becomes impractical as the system grows and resources are restricted, mainly in a scenario with short release cycles. Test case prioritization is a commonly used technique to optimize the test execution. However, most prioritization approaches do not work for manual test cases as they require source code information or test execution history, which is often not available in a manual testing scenario. In this paper, we propose a prioritization approach for manual test cases written in natural language based on the tested application features (in particular, highly-used application features). Our approach consists of (1) identifying the tested features from natural language test cases (with zero-shot classification techniques) and (2) prioritizing test cases based on the features that they test. We leveraged the NSGA-II genetic algorithm for the multi-objective optimization of the test case ordering to maximize the coverage of highly-used features while minimizing the cumulative execution time. Our findings show that we can successfully identify the application features covered by test cases using an ensemble of pre-trained models with strong zero-shot capabilities (an F-score of 76.1%). Also, our prioritization approaches can find test case orderings that cover highly-used application features early in the test execution while keeping the time required to execute test cases short. QA engineers can use our approach to focus the test execution on test cases that cover features that are relevant to users.},
keywords = {Computer games, Game development, Natural language processing, Testing},
pubstate = {published},
tppubtype = {inproceedings}
}
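The first step of the approach, identifying which application feature a natural-language test case exercises, can be approximated with an off-the-shelf zero-shot classifier. The sketch below uses the Hugging Face zero-shot-classification pipeline with a single NLI model as a stand-in for the paper's ensemble of pre-trained models; the feature labels and test case are illustrative.

```python
from transformers import pipeline

# Single-model stand-in for the ensemble of zero-shot classifiers in the paper.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

test_case = (
    "Open the store, add 100 coins to the cart, and verify that the purchase "
    "confirmation dialog shows the updated balance."
)
candidate_features = ["in-game store", "player movement", "matchmaking", "settings menu"]

result = classifier(test_case, candidate_labels=candidate_features)
for label, score in zip(result["labels"], result["scores"]):
    print(f"{score:.2f}  {label}")
```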
Md Saeed Siddik; Cor-Paul Bezemer
Do Code Quality and Style Issues Differ Across (Non-)Machine Learning Notebooks? Yes! Inproceedings
23rd IEEE International Working Conference on Source Code Analysis and Manipulation (SCAM), pp. 1–12, IEEE, 2023.
Abstract | BibTeX | Tags: Computational notebooks, Empirical software engineering, Mining software repositories
@inproceedings{SiddikSCAM2023,
title = {Do Code Quality and Style Issues Differ Across (Non-)Machine Learning Notebooks? Yes!},
author = {Md Saeed Siddik and Cor-Paul Bezemer},
year = {2023},
date = {2023-10-03},
urldate = {2023-10-03},
booktitle = {23rd IEEE International Working Conference on Source Code Analysis and Manipulation (SCAM)},
pages = {1--12},
publisher = {IEEE},
abstract = {The popularity of computational notebooks is rapidly increasing because of their interactive code-output visualization and on-demand non-sequential code block execution. These notebook features have made notebooks especially popular with machine learning developers and data scientists. However, as prior work shows, notebooks generally contain low quality code. In this paper, we investigate whether the low quality code is inherent to the programming style in notebooks, or whether it is correlated with the use of machine learning techniques. We present a large-scale empirical analysis of 246,599 open-source notebooks to explore how machine learning code quality in Jupyter Notebooks differs from non-machine learning code, thereby focusing on code style issues. We explored code style issues across the Error, Convention, Warning, and Refactoring categories. We found that machine learning notebooks are of lower quality regarding PEP-8 code standards than non-machine learning notebooks, and their code quality distributions significantly differ with a small effect size. We identified several code style issues with large differences in occurrences between machine learning and non-machine learning notebooks. For example, package and import-related issues are more prevalent in machine learning notebooks. Our study shows that code quality and code style issues differ significantly across machine learning and non-machine learning notebooks.},
keywords = {Computational notebooks, Empirical software engineering, Mining software repositories},
pubstate = {published},
tppubtype = {inproceedings}
}
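As a concrete illustration of the kind of analysis behind this study, the sketch below tallies Pylint messages for a notebook's code cells by category (Convention, Refactoring, Warning, Error). The paper's exact linter configuration is not reproduced here, so treat this as an assumed, simplified pipeline.

import json
import subprocess
import tempfile
from collections import Counter
from pathlib import Path

def count_style_issues(notebook_path: str) -> Counter:
    # Concatenate all code cells of the notebook into one temporary Python file.
    nb = json.loads(Path(notebook_path).read_text(encoding="utf-8"))
    code = "\n\n".join("".join(cell["source"])
                       for cell in nb.get("cells", [])
                       if cell.get("cell_type") == "code")
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as tmp:
        tmp.write(code)
        script = tmp.name

    # Run Pylint and group its messages by type (convention/refactor/warning/error).
    proc = subprocess.run(["pylint", "--output-format=json", script],
                          capture_output=True, text=True)
    messages = json.loads(proc.stdout or "[]")
    return Counter(msg["type"] for msg in messages)

print(count_style_issues("example_notebook.ipynb"))  # hypothetical notebook path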
Mikael Sabuhi
Strategies For Building Performant Containerized Applications PhD Thesis
2023.
Abstract | BibTeX | Tags: Docker, Docker Hub, Microservices, Performance, Performance analysis, Performance engineering
@phdthesis{phd_mikael,
title = {Strategies For Building Performant Containerized Applications},
author = {Mikael Sabuhi},
year = {2023},
date = {2023-09-25},
urldate = {2023-09-25},
abstract = {The evolution of cloud computing in the last decade has offered unprecedented access to sizable, configurable computing resources with minimal management effort. Containerization of applications, particularly through Docker, has been pivotal in this progression. As modern software increasingly relies on various cloud services, designing performant cloud applications has emerged as a critical concern. Key attributes of such applications include reliability, scalability, efficiency, fault tolerance, and responsiveness. This thesis seeks to address the challenges intrinsic to creating performant cloud applications by developing strategies aimed at achieving these characteristics through: 1) the application of autoscaling techniques to enhance scalability, efficiency, and responsiveness; 2) the introduction of a methodology for assessing the impact of Docker image upgrades on containerized applications to prevent performance degradation; and 3) the utilization of microservices architecture to develop scalable, reliable, and fault-tolerant cloud applications. In our initial research, we propose a pioneering approach to optimize the performance and resource usage of containerized cloud applications using adaptive controllers grounded in control theory. Our methodology harnesses the capacity of neural networks to capture the intrinsic non-linearity of these applications, and adapts the parameters of a proportional-integral-derivative (PID) controller to accommodate environmental changes. The outcomes demonstrate significant enhancements in resource utilization and a reduction in service level agreement violations, surpassing the performance of other examined autoscaling techniques. In the subsequent study, we present a method to evaluate the performance implications of Docker image upgrades on cloud software systems and their correlation with application dependencies. Our case study of 90 official WordPress images underscores the need for comprehensive performance testing before upgrades, the importance of maintaining a performance repository for reporting test results, and the potential benefits of extending semantic versioning to encompass performance modifications. This investigation encourages an enlightened approach to Docker image management, promoting enhanced cloud application performance. Lastly, we introduce Micro-FL, a fault-tolerant federated learning framework crafted to enhance the reliability and scalability of cloud-based machine learning platforms. By incorporating a microservices-based architecture within Docker containers, Micro-FL overcomes challenges typically associated with federated learning, such as resource constraints, scalability, and system faults. Performance assessments demonstrate Micro-FL’s capability to efficiently manage faults and streamline federated learning processes, offering a more robust and scalable solution for federated learning. The research work presented in this thesis provides deep insights, actionable recommendations, and effective and thoroughly evaluated approaches for building performant cloud applications.
},
keywords = {Docker, Docker Hub, Microservices, Performance, Performance analysis, Performance engineering},
pubstate = {published},
tppubtype = {phdthesis}
}
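To make the autoscaling idea from the first study in this thesis tangible, here is a minimal fixed-gain PID controller sketch that nudges a replica count toward a CPU-utilization target. The gains, target, and bounds are illustrative assumptions; in the thesis the controller parameters are adapted online by a neural network.

from dataclasses import dataclass

@dataclass
class PIDAutoscaler:
    kp: float = 0.8
    ki: float = 0.1
    kd: float = 0.05
    target_util: float = 0.6      # desired CPU utilization (0..1), assumed value
    min_replicas: int = 1
    max_replicas: int = 20
    _integral: float = 0.0
    _prev_error: float = 0.0

    def step(self, measured_util: float, replicas: int, dt: float = 30.0) -> int:
        # Positive error (over-utilized) -> scale out; negative -> scale in.
        error = measured_util - self.target_util
        self._integral += error * dt
        derivative = (error - self._prev_error) / dt
        self._prev_error = error
        adjustment = self.kp * error + self.ki * self._integral + self.kd * derivative
        new_replicas = round(replicas + adjustment * replicas)
        return max(self.min_replicas, min(self.max_replicas, new_replicas))

controller = PIDAutoscaler()
print(controller.step(measured_util=0.85, replicas=4))  # suggests scaling out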
Finlay Macklon; Markos Viggiato; Natalia Romanova; Chris Buzon; Dale Paas; Cor-Paul Bezemer
A Taxonomy of Testable HTML5 Canvas Issues Journal Article
IEEE Transactions on Software Engineering (TSE), 49 (6), pp. 3647–3659, 2023.
Abstract | BibTeX | Tags: Testing, Web applications
@article{MacklonTSE2023,
title = {A Taxonomy of Testable HTML5 Canvas Issues},
author = {Finlay Macklon and Markos Viggiato and Natalia Romanova and Chris Buzon and Dale Paas and Cor-Paul Bezemer},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
journal = {IEEE Transactions on Software Engineering (TSE)},
volume = {49},
number = {6},
pages = {3647--3659},
abstract = {The HTML5 canvas is widely used to display high quality graphics in web applications. However, the combination of
web, GUI, and visual techniques that are required to build canvas applications, together with the lack of testing and debugging
tools, makes developing such applications very challenging. To help direct future research on testing canvas applications, in this
paper we present a taxonomy of testable canvas issues. First, we extracted 2,403 canvas related issue reports from 123 open
source GitHub projects that use the HTML5 canvas. Second, we constructed our taxonomy by manually classifying a random
sample of 332 issue reports. Our manual classification identified five broad categories of testable canvas issues, such as Visual
and Performance issues. We found that Visual issues are the most frequent (35%), while Performance issues are relatively infrequent
(5%). We also found that many testable canvas issues that present themselves visually on the canvas are actually caused by
other components of the web application. Our taxonomy of testable canvas issues can be used to steer future research into
canvas issues and testing.},
keywords = {Testing, Web applications},
pubstate = {published},
tppubtype = {article}
}
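As a rough illustration of the issue-extraction step in this study, the sketch below pulls issue reports from a single GitHub repository via the REST API and keeps those that mention the canvas. The repository name, keyword filter, and the omission of pagination and authentication are simplifying assumptions; the study itself mined 2,403 issue reports across 123 projects.

import requests

def canvas_issues(owner: str, repo: str, keyword: str = "canvas"):
    # Fetch up to 100 issues (open and closed) from the GitHub REST API.
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    resp = requests.get(url, params={"state": "all", "per_page": 100})
    resp.raise_for_status()
    # Keep issues whose title or body mentions the keyword.
    return [issue for issue in resp.json()
            if keyword in ((issue.get("title") or "") + (issue.get("body") or "")).lower()]

# Hypothetical example repository.
for issue in canvas_issues("example-org", "example-canvas-app"):
    print(issue["number"], issue["title"])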