Arthur V. Kamienski; Abram Hindle; Cor-Paul Bezemer
Analyzing Techniques for Duplicate Question Detection on Q&A Websites for Game Developers Journal Article
Empirical Software Engineering Journal (EMSE), 28 (17), 2022.
@article{arthur2022,
title = {Analyzing Techniques for Duplicate Question Detection on Q&A Websites for Game Developers},
author = {Arthur V. Kamienski and Abram Hindle and Cor-Paul Bezemer},
year = {2022},
date = {2022-12-08},
journal = {Empirical Software Engineering Journal (EMSE)},
volume = {28},
number = {17},
abstract = {Game development is currently the largest industry in the entertainment segment and has a high demand for skilled game developers that can produce high-quality games. To satiate this demand, game developers need resources that can provide them with the knowledge they need to learn and improve their skills. Question and Answer (Q&A) websites are one of such resources that provide a valuable source of knowledge about game development practices. However, the presence of duplicate questions on Q&A websites hinders their ability to effectively provide information for their users. While several researchers created and analyzed techniques for duplicate question detection on websites such as Stack Overflow, so far no studies have explored how well those techniques work on Q&A websites for game development. With that in mind, in this paper we analyze how we can use pre-trained and unsupervised techniques to detect duplicate questions on Q&A websites focused on game development using data extracted from the Game Development Stack Exchange and Stack Overflow. We also explore how we can leverage a small set of labelled data to improve the performance of those techniques. The pre-trained technique based on MPNet achieved the highest results in identifying duplicate questions about game development, and we could achieve a better performance when combining multiple unsupervised techniques into a single supervised model. Furthermore, the supervised models could identify duplicate questions on websites different from those they were trained on with little to no decrease in performance. Our results lay the groundwork for building better duplicate question detection systems in Q&A websites for game developers and ultimately providing game developers with a more effective Q&A community.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
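As an illustration of the best-performing technique in this paper, below is a minimal Python sketch of scoring question similarity with an MPNet-based sentence embedding model (the model name "all-mpnet-base-v2" and the example questions are assumptions, not the authors' exact pipeline):

from sentence_transformers import SentenceTransformer, util

# Load a pre-trained MPNet-based sentence embedding model (assumed checkpoint name).
model = SentenceTransformer("all-mpnet-base-v2")

q1 = "How do I detect collisions between sprites in Unity?"
q2 = "Unity: checking when two 2D sprites overlap"

# Embed both questions and compare them with cosine similarity.
embeddings = model.encode([q1, q2], convert_to_tensor=True)
score = util.cos_sim(embeddings[0], embeddings[1]).item()
print(f"similarity: {score:.3f}")  # high scores flag duplicate candidates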
Finlay Macklon; Mohammad Reza Taesiri; Markos Viggiato; Stefan Antoszko; Natalia Romanova; Dale Paas; Cor-Paul Bezemer
Automatically Detecting Visual Bugs in HTML5 <canvas> Games Inproceedings
37th IEEE/ACM International Conference on Automated Software Engineering (ASE), 2022.
@inproceedings{finlay_ase2022,
title = {Automatically Detecting Visual Bugs in HTML5 <canvas> Games},
author = {Finlay Macklon and Mohammad Reza Taesiri and Markos Viggiato and Stefan Antoszko and Natalia Romanova and Dale Paas and Cor-Paul Bezemer},
year = {2022},
booktitle = {37th IEEE/ACM International Conference on Automated Software Engineering (ASE)},
keywords = {Computer games, Game development, Gaming, Regression testing, Testing, Web applications},
pubstate = {published},
tppubtype = {inproceedings}
}
Luisa Palechor
Characterizing (un)successful open source blockchain projects and their testing practices Masters Thesis
2022.
@mastersthesis{luisa2022,
title = {Characterizing (un)successful open source blockchain projects and their testing practices},
author = {Luisa Palechor},
year = {2022},
date = {2022-09-26},
urldate = {2022-09-26},
abstract = {The most well-known blockchain applications are cryptocurrencies, e.g., Ether and Bitcoin, which together have a market cap of more than 560 billion US dollars. Besides cryptocurrency applications, programmable blockchain allows the development of different applications, e.g., peer-to-peer selling of renewable energy from smart grids, digital rights management, and supply chain tracking and operation. These applications can be developed and deployed on the blockchain through smart contracts, which are small programs that run on the blockchain under particular conditions. As bugs in blockchain applications (in particular, cryptocurrencies) can have large financial impact, it is important to ensure that these applications are well-developed and well-tested. However, currently software development and testing practices of blockchain projects are largely unstudied. In this thesis, we study data from GitHub and CoinMarketCap to understand the characteristics of successful and unsuccessful blockchain projects and reveal the testing practices in Solidity projects with the aim of helping developers to identify projects from which they can learn, or should contribute to. In the first part of the thesis, we study data from CoinMarketCap and GitHub to gain knowledge about the characteristics of successful and unsuccessful blockchain projects. We build a random forest classifier with 320 labelled projects and metrics from 3 dimensions (activity, popularity, and complexity). We found that a large number of stars and a project’s age can help distinguish between successful and unsuccessful projects. Additionally, we found that code cloning practices tend to be common in unsuccessful projects written in Python, C++, Java and Solidity. In the second part of the thesis, we explore how quality is addressed in blockchain applications by studying how 139 open source Solidity projects are tested. We show that core development team members are the developers who usually contribute to testing files, leaving external contributions rare. In addition, our results indicate that only functional testing is practiced among the majority of Solidity projects, with Truffle and Hardhat being the tools commonly used to test Solidity smart contracts. Moreover, security testing is a practice rarely conducted, and performance testing is not conducted at all. We finally found that audits by a third party are common in several smart contracts. Future researchers and developers can use our findings to understand what characterizes successful and unsuccessful blockchain projects and be aware of the testing practices developers conduct in open source blockchain projects.},
keywords = {blockchain, Smart contracts, Testing},
pubstate = {published},
tppubtype = {mastersthesis}
}
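A bare-bones sketch of the kind of classifier described in the first part of this thesis: a random forest over activity, popularity, and complexity metrics predicting project success (the metric names and values below are invented placeholders; the actual study used 320 labelled projects):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Toy feature matrix: [commits, stars, age_in_days] per project (placeholders).
X = [
    [1200, 5400, 2100],
    [15, 12, 2300],
    [800, 900, 1500],
    [9, 3, 400],
]
y = [1, 0, 1, 0]  # 1 = successful, 0 = unsuccessful

clf = RandomForestClassifier(n_estimators=100, random_state=0)
print(cross_val_score(clf, X, y, cv=2))  # tiny toy data, for illustration only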
Markos Viggiato; Dale Paas; Chris Buzon; Cor-Paul Bezemer
Using Natural Language Processing Techniques to Improve Manual Test Case Descriptions Inproceedings
International Conference on Software Engineering - Software Engineering in Practice (ICSE - SEIP) Track, 2022.
@inproceedings{ViggiatoSEIP2022,
title = {Using Natural Language Processing Techniques to Improve Manual Test Case Descriptions},
author = {Markos Viggiato and Dale Paas and Chris Buzon and Cor-Paul Bezemer},
year = {2022},
date = {2022-05-08},
booktitle = {International Conference on Software Engineering - Software Engineering in Practice (ICSE - SEIP) Track},
abstract = {Despite the recent advancements in test automation, software testing often remains a manual, and costly, activity in many industries. Manual test cases, often described only in natural language, consist of one or more test steps, which are instructions that must be performed to achieve the testing objective. Having different employees specifying test cases might result in redundant, unclear, or incomplete test cases. Manually reviewing and validating newly-specified test cases is time-consuming and becomes impractical in a scenario with a large test suite. Therefore, in this paper, we propose an automated framework to automatically analyze test cases that are specified in natural language and provide actionable recommendations on how to improve the test cases. Our framework consists of configurable components and modules for analysis, which are capable of recommending improvements to the following: (1) the terminology of a new test case through language modeling, (2) potentially missing test steps for a new test case through frequent itemset and association rule mining, and (3) recommendation of similar test cases that already exist in the test suite through text embedding and clustering. We thoroughly evaluated the three modules on data from our industry partner. Our framework can provide actionable recommendations, which is an important challenge given the widespread occurrence of test cases that are described only in natural language in the software industry (in particular, the game industry).},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
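The framework's second module (missing test steps via frequent itemsets and association rules) can be sketched as follows; mlxtend serves as a stand-in implementation here, and the toy test cases are not the industry partner's data:

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Each existing test case is treated as a "transaction" of test steps.
test_cases = [
    ["open app", "log in", "open settings"],
    ["open app", "log in", "start match"],
    ["open app", "log in", "open settings", "change language"],
]
te = TransactionEncoder()
onehot = pd.DataFrame(te.fit(test_cases).transform(test_cases), columns=te.columns_)

itemsets = apriori(onehot, min_support=0.5, use_colnames=True)
rules = association_rules(itemsets, metric="confidence", min_threshold=0.9)

# A rule like {open app} -> {log in} suggests that a new test case that
# contains "open app" but not "log in" may be missing a step.
print(rules[["antecedents", "consequents", "confidence"]])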
Markos Viggiato; Dale Paas; Chris Buzon; Cor-Paul Bezemer
Identifying Similar Test Cases That Are Specified in Natural Language Journal Article
IEEE Transactions on Software Engineering (TSE), 2022.
@article{ViggiatoTSE2022,
title = {Identifying Similar Test Cases That Are Specified in Natural Language},
author = {Markos Viggiato and Dale Paas and Chris Buzon and Cor-Paul Bezemer},
year = {2022},
date = {2022-04-21},
urldate = {2022-04-21},
journal = {IEEE Transactions on Software Engineering (TSE)},
abstract = {Software testing is still a manual process in many industries, despite the recent improvements in automated testing techniques. As a result, test cases (which consist of one or more test steps that need to be executed manually by the tester) are often specified in natural language by different employees and many redundant test cases might exist in the test suite. This increases the (already high) cost of test execution. Manually identifying similar test cases is a time-consuming and error-prone task. Therefore, in this paper, we propose an unsupervised approach to identify similar test cases. Our approach uses a combination of text embedding, text similarity and clustering techniques to identify similar test cases. We evaluate five different text embedding techniques, two text similarity metrics, and two clustering techniques to cluster similar test steps and three techniques to identify similar test cases from the test step clusters. Through an evaluation in an industrial setting, we showed that our approach achieves a high performance to cluster test steps (an F-score of 87.39%) and identify similar test cases (an F-score of 83.47%). Furthermore, a validation with developers indicates several different practical usages of our approach (such as identifying redundant test cases), which help to reduce the manual testing effort and time.},
keywords = {Game development, Testing},
pubstate = {published},
tppubtype = {article}
}
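A simplified sketch of the "embed, then cluster" idea for grouping similar test steps; the paper evaluates several embedding, similarity, and clustering techniques, so TF-IDF plus agglomerative clustering below is just one assumed combination over toy steps:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering

steps = [
    "launch the game on android",
    "start the game on an android device",
    "verify the login screen is shown",
    "check that the login screen appears",
]
# Embed each step as a TF-IDF vector, then cluster the vectors.
vectors = TfidfVectorizer().fit_transform(steps).toarray()
labels = AgglomerativeClustering(n_clusters=2).fit_predict(vectors)
print(labels)  # steps sharing a label are treated as similar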
Mikael Sabuhi; Petr Musilek; Cor-Paul Bezemer
Studying the Performance Risks of Upgrading Docker Hub Images: A Case Study of WordPress Inproceedings
ACM/SPEC International Conference on Performance Engineering (ICPE), 2022.
@inproceedings{SabuhiICPE2022,
title = {Studying the Performance Risks of Upgrading Docker Hub Images: A Case Study of WordPress},
author = {Mikael Sabuhi and Petr Musilek and Cor-Paul Bezemer},
year = {2022},
date = {2022-04-09},
booktitle = {ACM/SPEC International Conference on Performance Engineering (ICPE)},
abstract = {The Docker Hub repository contains Docker images of applications, which allow users to do in-place upgrades to benefit from the latest released features and security patches. However, prior work showed that upgrading a Docker image not only changes the main application, but can also change many dependencies. In this paper, we present a methodology to study the performance impact of upgrading the Docker Hub image of an application, thereby focusing on changes to dependencies. We demonstrate our methodology through a case study of 90 official images of the WordPress application. Our study shows that Docker image users should be cautious and conduct a performance test before upgrading to a newer Docker image in most cases. Our methodology can assist them to better understand the performance risks of such upgrades, and helps them to decide how thorough such a performance test should be.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Mohammad Reza Taesiri; Finlay Macklon; Cor-Paul Bezemer
CLIP meets GamePhysics: Towards bug identification in gameplay videos using zero-shot transfer learning Inproceedings
International Conference on Mining Software Repositories (MSR), 2022.
@inproceedings{TaesiriMSR2022,
title = {CLIP meets GamePhysics: Towards bug identification in gameplay videos using zero-shot transfer learning},
author = {Mohammad Reza Taesiri and Finlay Macklon and Cor-Paul Bezemer},
year = {2022},
date = {2022-03-24},
urldate = {2022-03-24},
booktitle = {International Conference on Mining Software Repositories (MSR)},
abstract = {Gameplay videos contain rich information about how players interact with the game and how the game responds. Sharing gameplay videos on social media platforms, such as Reddit, has become a common practice for many players. Often, players will share gameplay videos that showcase video game bugs. Such gameplay videos are software artifacts that can be utilized for game testing, as they provide insight for bug analysis. Although large repositories of gameplay videos exist, parsing and mining them in an effective and structured fashion has still remained a big challenge. In this paper, we propose a search method that accepts any English text query as input to retrieve relevant videos from large repositories of gameplay videos. Our approach does not rely on any external information (such as video metadata); it works solely based on the content of the video. By leveraging the zero-shot transfer capabilities of the Contrastive Language-Image Pre-Training (CLIP) model, our approach does not require any data labeling or training. To evaluate our approach, we present the GamePhysics dataset consisting of 26,954 videos from 1,873 games, that were collected from the GamePhysics section on the Reddit website. Our approach shows promising results in our extensive analysis of simple queries, compound queries, and bug queries, indicating that our approach is useful for object and event detection in gameplay videos. An example application of our approach is as a gameplay video search engine to aid in reproducing video game bugs. Please visit the following link for the code and the data: https://asgaardlab.github.io/CLIPxGamePhysics/},
keywords = {Bug report, Computer games, Game development, Gameplay videos, Gaming},
pubstate = {published},
tppubtype = {inproceedings}
}
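A minimal sketch of the zero-shot retrieval idea: embed sampled video frames and a text query with CLIP and rank frames by similarity (the model checkpoint and frame files are assumptions; the paper's full pipeline operates over whole videos):

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

query = "a car flying in the air"  # any English text query
frames = [Image.open(p) for p in ["frame1.png", "frame2.png"]]  # sampled frames

inputs = processor(text=[query], images=frames, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# logits_per_image holds one similarity score per (frame, query) pair;
# higher means the frame matches the query better.
scores = outputs.logits_per_image.squeeze(1)
print(scores.argsort(descending=True))  # frame indices, best match first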
Simon Eismann; Diego Costa; Lizhi Liao; Cor-Paul Bezemer; Weiyi Shang; André van Hoorn; Samuel Kounev
A Case Study on the Stability of Performance Tests for Serverless Applications Journal Article
Journal of Systems and Software, 2022.
@article{EismannJSS2022,
title = {A Case Study on the Stability of Performance Tests for Serverless Applications},
author = {Simon Eismann and Diego Costa and Lizhi Liao and Cor-Paul Bezemer and Weiyi Shang and André van Hoorn and Samuel Kounev},
year = {2022},
date = {2022-03-17},
urldate = {2022-03-17},
journal = {Journal of Systems and Software},
abstract = {Context. While in serverless computing, application resource management and operational concerns are generally delegated to the cloud provider, ensuring that serverless applications meet their performance requirements is still a responsibility of the developers. Performance testing is a commonly used performance assessment practice; however, it traditionally requires visibility of the resource environment.
Objective. In this study, we investigate whether performance tests of serverless applications are stable, that is, if their results are reproducible, and what implications the serverless paradigm has for performance tests.
Method. We conduct a case study where we collect two datasets of performance test results: (a) repetitions of performance tests for varying memory size and load intensities and (b) three repetitions of the same performance test every day for ten months.
Results. We find that performance tests of serverless applications are comparatively stable if conducted on the same day. However, we also observe short-term performance variations and frequent long-term performance changes.
Conclusion. Performance tests for serverless applications can be stable; however, the serverless model impacts the planning, execution, and analysis of performance tests.},
keywords = {Performance engineering, Performance regressions, Performance testing, Serverless},
pubstate = {published},
tppubtype = {article}
}
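A small sketch of the stability check this study motivates: compare repeated performance test runs via the coefficient of variation (the latencies and the 5% threshold below are invented for illustration):

import numpy as np

runs = {
    "day1": [102.0, 99.5, 101.2],  # mean response times (ms) of 3 repetitions
    "day2": [100.8, 143.7, 101.9],
}
for day, reps in runs.items():
    cv = np.std(reps) / np.mean(reps)  # coefficient of variation
    verdict = "unstable" if cv > 0.05 else "stable"
    print(f"{day}: CV={cv:.2%} -> {verdict}")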
Luisa Palechor; Cor-Paul Bezemer
How are Solidity smart contracts tested in open source projects? An exploratory study Inproceedings
3rd IEEE/ACM International Conference on Automation of Software Test (AST), 2022.
@inproceedings{PalechorAST2022,
title = {How are Solidity smart contracts tested in open source projects? An exploratory study},
author = {Luisa Palechor and Cor-Paul Bezemer},
year = {2022},
date = {2022-03-10},
urldate = {2022-03-10},
booktitle = {3rd IEEE/ACM International Conference on Automation of Software Test (AST)},
abstract = {Smart contracts are self-executing programs that are stored on the blockchain. Once a smart contract is compiled and deployed on the blockchain, it cannot be modified. Therefore, having a bug-free smart contract is vital. To ensure a bug-free smart contract, it must be tested thoroughly. However, little is known about how developers test smart contracts in practice. Our study explores 139 open source smart contract projects that are written in Solidity to investigate the state of smart contract testing from three dimensions: (1) the developers working on the tests, (2) the used testing frameworks and testnets and (3) the type of tests that are conducted. We found that mostly core developers of a project are responsible for testing the contracts. Second, developers typically use only functional testing frameworks to test a smart contract, with Truffle being the most popular one. Finally, our results show that functional testing is conducted in most of the studied projects (93%), security testing is only performed in a few projects (9.4%) and traditional performance testing is conducted in none. In addition, we found 25 projects that mentioned or published external audit reports.},
keywords = {Smart contracts, Testing},
pubstate = {published},
tppubtype = {inproceedings}
}
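One way to approximate the study's "used testing frameworks" dimension is to scan a project for framework configuration files; the sketch below assumes the usual Truffle/Hardhat file-name conventions:

import os

MARKERS = {
    "Truffle": {"truffle-config.js", "truffle.js"},
    "Hardhat": {"hardhat.config.js", "hardhat.config.ts"},
}

def detect_frameworks(repo_path):
    """Return the set of testing frameworks whose config files appear in the repo."""
    found = set()
    for _, _, files in os.walk(repo_path):
        for framework, markers in MARKERS.items():
            if markers & set(files):
                found.add(framework)
    return found or {"unknown"}

print(detect_frameworks("./some-solidity-project"))  # path is a placeholder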
Hao Li; Filipe R. Cogo; Cor-Paul Bezemer
An Empirical Study of Yanked Releases in the Rust Package Registry Journal Article
IEEE Transactions on Software Engineering (TSE), 2022.
@article{LiTSE2022,
title = {An Empirical Study of Yanked Releases in the Rust Package Registry},
author = {Hao Li and Filipe R. Cogo and Cor-Paul Bezemer},
year = {2022},
date = {2022-02-14},
urldate = {2022-02-14},
journal = {IEEE Transactions on Software Engineering (TSE)},
abstract = {Cargo, the software packaging manager of Rust, provides a yank mechanism to support release-level deprecation, which can prevent packages from depending on yanked releases. Most prior studies focused on code-level (i.e., deprecated APIs) and package-level deprecation (i.e., deprecated packages). However, few studies have focused on release-level deprecation. In this study, we investigate how often and how the yank mechanism is used, the rationales behind its usage, and the adoption of yanked releases in the Cargo ecosystem. Our study shows that 9.6% of the packages in Cargo have at least one yanked release, and the proportion of yanked releases kept increasing from 2014 to 2020. Package owners yank releases for other reasons than withdrawing a defective release, such as fixing a release that does not follow semantic versioning or indicating a package is removed or replaced. In addition, we found that 46% of the packages directly adopted at least one yanked release and the yanked releases propagated through the dependency network, which leads to 1.4% of the releases in the ecosystem having unresolved dependencies.},
keywords = {Release Management, Software Ecosystem},
pubstate = {published},
tppubtype = {article}
}
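The yank data the study mines is exposed by the crates.io API; here is a hedged sketch of listing the yanked releases of a single crate (the endpoint and response shape reflect the public API as I understand it):

import requests

crate = "serde"  # example crate
resp = requests.get(
    f"https://crates.io/api/v1/crates/{crate}/versions",
    headers={"User-Agent": "yank-example"},  # crates.io asks clients to identify themselves
)
versions = resp.json()["versions"]

# Each version record carries a boolean "yanked" flag.
yanked = [v["num"] for v in versions if v.get("yanked")]
print(f"{crate}: {len(yanked)}/{len(versions)} releases yanked")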
Mikael Sabuhi; Ming (Chloe) Zhou; Cor-Paul Bezemer; Petr Musilek
Applications of Generative Adversarial Networks in Anomaly Detection: A Systematic Literature Review Journal Article
IEEE Access, 2021.
@article{mikael_gan2021,
title = {Applications of Generative Adversarial Networks in Anomaly Detection: A Systematic Literature Review},
author = {Mikael Sabuhi and Ming (Chloe) Zhou and Cor-Paul Bezemer and Petr Musilek},
year = {2021},
date = {2021-12-01},
urldate = {2021-12-01},
journal = {IEEE Access},
abstract = {Anomaly detection has become an indispensable tool for modern society, applied in a wide range of applications, from detecting fraudulent transactions to malignant brain tumors. Over time, many anomaly detection techniques have been introduced. However, in general, they all suffer from the same problem: lack of data that represents anomalous behaviour. As anomalous behaviour is usually costly (or dangerous) for a system, it is difficult to gather enough data that represents such behaviour. This, in turn, makes it difficult to develop and evaluate anomaly detection techniques. Recently, generative adversarial networks (GANs) have attracted much attention in anomaly detection research, due to their unique ability to generate new data. In this paper, we present a systematic review of the literature in this area, covering 128 papers. The goal of this review paper is to analyze the relation between anomaly detection techniques and types of GANs, to identify the most common application domains for GAN-assisted and GAN-based anomaly detection, and to assemble information on datasets and performance metrics used to assess them. Our study helps researchers and practitioners to find the most suitable GAN-assisted anomaly detection technique for their application. In addition, we present a research roadmap for future studies in this area. In summary, GANs are used in anomaly detection to address the problem of insufficient amount of data for the anomalous behaviour, either through data augmentation or representation learning. The most commonly used GAN architectures are DCGANs, standard GANs, and cGANs. The primary application domains include medicine, surveillance and intrusion detection.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Arthur V. Kamienski
Studying Trends, Topics, and Duplicate Questions on Q&A Websites for Game Developers Masters Thesis
University of Alberta, 2021.
@mastersthesis{msc_arthur,
title = {Studying Trends, Topics, and Duplicate Questions on Q&A Websites for Game Developers},
author = {Arthur V. Kamienski},
year = {2021},
date = {2021-09-29},
urldate = {2021-09-29},
school = {University of Alberta},
abstract = {The game development industry is growing and there is a high demand for developers that can produce high-quality games. These developers need resources to learn and improve the skills required to build those games in a reliable and easy manner. Question and Answer (Q&A) websites are learning resources that are commonly used by software developers to share knowledge and acquire the information they need. However, we still know little about how game developers use and interact with Q&A websites. In this thesis, we analyze the largest Q&A websites that discuss game development to understand how effective they are as learning resources and what can be improved to build a better Q&A community for their users.

In the first part of this thesis, we analyzed data collected from four Q&A websites, namely Unity Answers, the Unreal Engine 4 (UE4) AnswerHub, the Game Development Stack Exchange, and Stack Overflow, to assess their effectiveness in helping game developers. We also used the 347 responses collected from a survey we ran with game developers to gauge their perception of Q&A websites. We found that the studied websites are in decline, with their activity and effectiveness decreasing over the last few years and users having an overall negative view of the studied Q&A communities. We also characterized the topics discussed in those websites using a latent Dirichlet allocation (LDA) model, and analyze how those topics differ across websites. Finally, we give recommendations to guide developers to the websites that are most effective in answering the types of questions they have, which could help the websites in overcoming their decline.

In the second part of the thesis, we explored how we can further help Q&A websites for game developers by automatically identifying duplicate questions. Duplicate questions have a negative impact on Q&A websites by overloading them with questions that have already been answered. Therefore, we analyzed the performance of seven unsupervised and pre-trained techniques on the task of detecting duplicate questions on Q&A websites for game developers. We achieved the highest performance when comparing all the text content of questions and their answers using a pre-trained technique based on MPNet. Furthermore, we could almost double the performance by combining all of the techniques into a single question similarity score using supervised models. Lastly, we show that the supervised models can be used on websites different from the ones they were trained on with little to no decrease in performance. Our findings can be used by Q&A websites and future researchers to build better systems for duplicate question detection, which can ultimately provide game developers with better Q&A communities.},
keywords = {Computer games, Q&A websites},
pubstate = {published},
tppubtype = {mastersthesis}
}
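A compact sketch of the LDA step used in the first part of the thesis to characterize discussion topics; sklearn's LatentDirichletAllocation stands in for the thesis' exact setup, and the question titles are invented:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

questions = [
    "how to move a character with rigidbody in unity",
    "unreal engine blueprint to spawn actors",
    "shader graph material turns pink in unity",
    "best way to replicate movement in unreal multiplayer",
]
vectorizer = CountVectorizer(stop_words="english").fit(questions)
X = vectorizer.transform(questions)

lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
terms = vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    top_terms = [terms[j] for j in topic.argsort()[-3:]]
    print(f"topic {i}: {top_terms}")  # top words characterize each topic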
Arthur V. Kamienski; Cor-Paul Bezemer
An Empirical Study of Q&A Websites for Game Developers Journal Article
Empirical Software Engineering Journal (EMSE), 2021.
@article{arthur2021,
title = {An Empirical Study of Q&A Websites for Game Developers},
author = {Arthur V. Kamienski and Cor-Paul Bezemer},
year = {2021},
date = {2021-07-07},
urldate = {2021-07-07},
journal = {Empirical Software Engineering Journal (EMSE)},
abstract = {The game development industry is growing, and training new developers in game development-specific abilities is essential to satisfying its need for skilled game developers. These developers require effective learning resources to acquire the information they need and improve their game development skills. Question and Answer (Q&A) websites stand out as some of the most used online learning resources in software development. Many studies have investigated how Q&A websites help software developers become more experienced. However, no studies have explored Q&A websites aimed at game development, and there is little information about how game developers use and interact with these websites. In this paper, we study four Q&A communities by analyzing game development data we collected from their websites and the 347 responses received on a survey we ran with game developers. We observe that the communities have declined over the past few years and identify factors that correlate to these changes. Using a Latent Dirichlet Allocation (LDA) model, we characterize the topics discussed in the communities. We also analyze how topics differ across communities and identify the most discussed topics. Furthermore, we find that survey respondents have a mostly negative view of the communities and tended to stop using the websites once they became more experienced. Finally, we provide recommendations on where game developers should post their questions, which can help mitigate the websites’ declines and improve their effectiveness.},
keywords = {Game development, Q&A communities},
pubstate = {published},
tppubtype = {article}
}
Quang N. Vu; Cor-Paul Bezemer
Improving the Discoverability of Indie Games by Leveraging their Similarity to Top-Selling Games: Identifying Important Requirements of a Recommender System Inproceedings
International Conference on the Foundations of Digital Games (FDG), pp. 1–12, 2021.
@inproceedings{Quang21,
title = {Improving the Discoverability of Indie Games by Leveraging their Similarity to Top-Selling Games: Identifying Important Requirements of a Recommender System},
author = {Quang N. Vu and Cor-Paul Bezemer},
year = {2021},
date = {2021-04-07},
urldate = {2021-04-07},
booktitle = {International Conference on the Foundations of Digital Games (FDG)},
pages = {1--12},
abstract = {Indie games often lack visibility as compared to top-selling games due to their limited marketing budget and the fact that there are a large number of indie games. Players of top-selling games usually like certain types of games or certain game elements such as theme, gameplay, storyline. Therefore, indie games could leverage their shared game elements with top-selling games to get discovered. In this paper, we propose an approach to improve the discoverability of indie games by recommending similar indie games to gamers of top-selling games. We first matched 2,830 itch.io indie games to 326 top-selling Steam games. We then contacted the indie game developers for evaluation feedback and suggestions. We found that the majority of them (67.9%) who offered verbose responses show positive support for our approach. We also analyzed the reasons for bad recommendations and the suggestions by indie game developers to lay out the important requirements for such a recommendation system. The most important ones are: a standardized and extensive tag and genre ontology system is needed to bridge the two platforms, the expectations of players of top-selling games should be managed to avoid disappointment, a player’s preferences should be integrated when making recommendations, a standardized age restriction rule is needed, and finally, the recommendation tool should also show indie games that are the least similar or less popular.},
keywords = {Computer games, Game discoverability, Indie games, itch.io, Steam},
pubstate = {published},
tppubtype = {inproceedings}
}
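A minimal sketch of the matching idea: score indie games against a top-selling game by their shared elements, approximated here as Jaccard similarity over tag sets (the titles and tags are invented examples):

def jaccard(a, b):
    """Jaccard similarity between two tag collections."""
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if a | b else 0.0

top_seller_tags = ["metroidvania", "platformer", "atmospheric"]  # e.g., a Steam hit
indie_games = {
    "Indie A": ["platformer", "metroidvania", "retro"],
    "Indie B": ["farming", "casual"],
}
# Recommend the indie games with the highest tag overlap.
for name, tags in sorted(indie_games.items(),
                         key=lambda kv: jaccard(kv[1], top_seller_tags),
                         reverse=True):
    print(name, round(jaccard(tags, top_seller_tags), 2))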
Filipe R. Cogo; Gustavo A. Oliva; Cor-Paul Bezemer; Ahmed E. Hassan
An Empirical Study of Same-day Releases of Popular Packages in the npm Ecosystem Journal Article
Empirical Software Engineering Journal (EMSE), 2021.
@article{cogo2021,
title = {An Empirical Study of Same-day Releases of Popular Packages in the npm Ecosystem},
author = {Filipe R. Cogo and Gustavo A. Oliva and Cor-Paul Bezemer and Ahmed E. Hassan},
year = {2021},
date = {2021-04-05},
urldate = {2021-04-05},
journal = {Empirical Software Engineering Journal (EMSE)},
abstract = {Within a software ecosystem, client packages can reuse provider packages as third-party libraries. The reuse relation between client and provider packages is called a dependency. When a client package depends on the code of a provider package, every change that is introduced in a release of the provider has the potential to impact the client package. Since a large number of dependencies exist within a software ecosystem, releases of a popular provider package can impact a large number of clients. Occasionally, multiple releases of a popular package need to be published on the same day, leading to a scenario in which the time available to revise, test, build, and document the release is restricted compared to releases published within a regular schedule. In this paper, our objective is to study the same-day releases that are published by popular packages in the npm ecosystem. We design an exploratory study to characterize the type of changes that are introduced in same-day releases, the prevalence of same-day releases in the npm ecosystem, and the adoption of same-day releases by client packages. A preliminary manual analysis of the existing release notes suggests that same-day releases introduce non-trivial changes (e.g., bug fixes). We then focus on three RQs. First, we study how often same-day releases are published. We found that the median proportion of regularly scheduled releases that are interrupted by a same-day release (per popular package) is 22%, suggesting the importance of having timely and systematic procedures to cope with same-day releases. Second, we study the performed code changes in same-day releases. We observe that 32% of the same-day releases have larger changes compared with their prior release, thus showing that some same-day releases can undergo significant maintenance activity despite their time-constrained nature. In our third RQ, we study how client packages react to same-day releases of their providers. We observe the vast majority of client packages that adopt the release preceding the same-day release would also adopt the latter without having to change their versioning statement (implicit updates). We also note that explicit adoptions of same-day releases (i.e., adoptions that require a change to the versioning statement of the provider in question) is significantly faster than the explicit adoption of regular releases. Based on our findings, we argue that (i) third-party tools that support the automation of dependency management (e.g., Dependabot) should consider explicitly flagging same-day releases, (ii) popular packages should strive for optimized release pipelines that can properly handle same-day releases, and (iii) future research should design scalable, ecosystem-ready tools that support provider packages in assessing the impact of their code changes on client packages.},
keywords = {Dependencies, Release Management, Same-day Release, Software Ecosystem},
pubstate = {published},
tppubtype = {article}
}
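Same-day releases can be spotted from the npm registry's per-version timestamps; a hedged sketch follows (the registry endpoint and its "time" field reflect the public npm API as I understand it):

from collections import defaultdict
import requests

pkg = "lodash"  # example package
time_map = requests.get(f"https://registry.npmjs.org/{pkg}").json()["time"]

by_day = defaultdict(list)
for version, stamp in time_map.items():
    if version not in ("created", "modified"):  # skip non-version keys
        by_day[stamp[:10]].append(version)      # group by YYYY-MM-DD

same_day = {day: vs for day, vs in by_day.items() if len(vs) > 1}
print(f"{pkg}: {len(same_day)} days with more than one release")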
Markos Viggiato; Dayi Lin; Abram Hindle; Cor-Paul Bezemer
What Causes Wrong Sentiment Classifications of Game Reviews? Journal Article
IEEE Transactions on Games, pp. 1–14, 2021.
@article{markos2021sentiment,
title = {What Causes Wrong Sentiment Classifications of Game Reviews?},
author = {Markos Viggiato and Dayi Lin and Abram Hindle and Cor-Paul Bezemer},
year = {2021},
date = {2021-04-05},
urldate = {2021-04-05},
journal = {IEEE Transactions on Games},
pages = {1--14},
institution = {University of Alberta},
abstract = {Sentiment analysis is a popular technique to identify the sentiment of a piece of text. Several different domains have been targeted by sentiment analysis research, such as Twitter, movie reviews, and mobile app reviews. Although several techniques have been proposed, the performance of current sentiment analysis techniques is still far from acceptable, mainly when applied in domains on which they were not trained. In addition, the causes of wrong classifications are not clear. In this paper, we study how sentiment analysis performs on game reviews. We first report the results of a large scale empirical study on the performance of widely-used sentiment classifiers on game reviews. Then, we investigate the root causes for the wrong classifications and quantify the impact of each cause on the overall performance. We study three existing classifiers: Stanford CoreNLP, NLTK, and SentiStrength. Our results show that most classifiers do not perform well on game reviews, with the best one being NLTK (with an AUC of 0.70). We also identified four main causes for wrong classifications, such as reviews that point out advantages and disadvantages of the game, which might confuse the classifier. The identified causes are not trivial to be resolved and we call upon sentiment analysis and game researchers and developers to prioritize a research agenda that investigates how the performance of sentiment analysis of game reviews can be improved, for instance by developing techniques that can automatically deal with specific game-related issues of reviews (e.g., reviews with advantages and disadvantages). Finally, we show that training sentiment classifiers on reviews that are stratified by the game genre is effective.},
keywords = {Computer games, Natural language processing, Sentiment analysis, Steam},
pubstate = {published},
tppubtype = {article}
}
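One of the studied classifiers (NLTK) can be tried on a game review in a few lines; the snippet below assumes NLTK's bundled VADER analyzer, and the review is invented. As the paper shows, reviews that mix pros and cons are exactly where such classifiers tend to go wrong:

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon", quiet=True)  # lexicon used by NLTK's VADER analyzer
sia = SentimentIntensityAnalyzer()

review = "Great art and music, but the controls are frustrating and buggy."
print(sia.polarity_scores(review))  # 'compound' in [-1, 1]; mixed reviews sit near 0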
Arthur V. Kamienski; Luisa Palechor; Cor-Paul Bezemer; Abram Hindle
PySStuBs: Characterizing Single-Statement Bugs in Popular Open-Source Python Projects Inproceedings
MSR Mining Challenge, pp. 1–5, 2021.
@inproceedings{athur2021pysstubs,
title = {PySStuBs: Characterizing Single-Statement Bugs in Popular Open-Source Python Projects},
author = {Arthur V. Kamienski and Luisa Palechor and Cor-Paul Bezemer and Abram Hindle},
year = {2021},
date = {2021-03-08},
urldate = {2021-03-08},
booktitle = {MSR Mining Challenge},
pages = {1--5},
abstract = {Single-statement bugs (SStuBs) can have a severe impact on developer productivity. Despite usually being simple and not offering much of a challenge to fix, these bugs may still disturb a developer’s workflow and waste precious development time. However, few studies have paid attention to these simple bugs, focusing instead on bugs of any size and complexity. In this study, we explore the occurrence of SStuBs in some of the most popular open-source Python projects on GitHub, while also characterizing their patterns and distribution. We further compare these bugs to SStuBs found in a previous study on Java Maven projects. We find that these Python projects have different SStuB patterns than the ones in Java Maven projects and identify 7 new SStuB patterns. Our results may help uncover the importance of understanding these bugs for the Python programming language, and how developers can handle them more effectively.},
keywords = {Open-source projects, Python, Single-statement bugs},
pubstate = {published},
tppubtype = {inproceedings}
}
Rain Epp; Dayi Lin; Cor-Paul Bezemer
An Empirical Study of Trends of Popular Virtual Reality Games and Their Complaints Journal Article
IEEE Transactions on Games, pp. 1–12, 2021.
@article{rain2021vr,
title = {An Empirical Study of Trends of Popular Virtual Reality Games and Their Complaints},
author = {Rain Epp and Dayi Lin and Cor-Paul Bezemer},
year = {2021},
date = {2021-01-29},
urldate = {2021-01-29},
journal = {IEEE Transactions on Games},
pages = {1--12},
institution = {University of Alberta},
abstract = {The market for virtual reality (VR) games is growing rapidly, and is expected to grow from $3.3B in 2018 to $13.7B in 2022. Due to the immersive nature of such games and the use of VR headsets, players may have complaints about VR games which are distinct from those about traditional computer games, and an understanding of those complaints could enable developers to better take advantage of the growing VR market. We conduct an empirical study of 750 popular VR games and 17,635 user reviews on Steam in order to understand trends in VR games and their complaints. We find that the VR games market is maturing. Fewer VR games are released each month but their quality appears to be improving over time. Most games support multiple headsets and play areas, and support for smaller-scale play areas is increasing. Complaints of cybersickness are rare and declining, indicating that players are generally more concerned with other issues. Recently, complaints about game-specific issues have become the most frequent type of complaint, and VR game developers can now focus on these issues and worry less about VR-comfort issues such as cybersickness.},
keywords = {Gamer complaints, Virtual reality games},
pubstate = {published},
tppubtype = {article}
}
Sara Gholami; Hamzeh Khazaei; Cor-Paul Bezemer
Should you Upgrade Official Docker Hub Images in Production Environments? Inproceedings
ICSE New Ideas and Emerging Results (NIER), pp. 1–5, 2021.
@inproceedings{sara2021icsenier,
title = {Should you Upgrade Official Docker Hub Images in Production Environments?},
author = {Sara Gholami and Hamzeh Khazaei and Cor-Paul Bezemer},
year = {2021},
date = {2021-01-29},
urldate = {2021-01-29},
booktitle = {ICSE New Ideas and Emerging Results (NIER)},
pages = {1--5},
abstract = {Docker, one of the most popular software containerization technologies, allows a user to deploy Docker images to create and run containers. While Docker images facilitate the deployment and in-place upgrading of an application in a production environment by replacing its container with one based on a newer image, many dependencies could change at once during such an image upgrade, which can potentially be a source of risk. In this paper, we study the official Docker images on Docker Hub and explore how packages are changing in these images. We found that the number of package changes varies across different types of applications and that often the changing packages are utility packages. Our study takes a first important look at potential risks when doing an in-place upgrade of a Docker image.},
keywords = {Containerization, Dependency upgrades, Docker, Docker Hub, Downgrades},
pubstate = {published},
tppubtype = {inproceedings}
}
Hareem Sahar; Abram Hindle; Cor-Paul Bezemer
How are Issue Reports Discussed in Gitter Chat Rooms? Journal Article
Journal of Systems and Software (JSS), pp. 1–53, 2020.
@article{sahar2020JSS-Gitter-Issues,
title = {How are Issue Reports Discussed in Gitter Chat Rooms?},
author = {Hareem Sahar and Abram Hindle and Cor-Paul Bezemer},
year = {2020},
date = {2020-10-29},
urldate = {2020-10-29},
journal = {Journal of Systems and Software (JSS)},
pages = {1--53},
institution = {University of Alberta},
abstract = {Informal communication channels like mailing lists, IRC and instant messaging play a vital role in open source software development by facilitating communication within geographically diverse project teams, e.g., to discuss issue reports to facilitate the bug-fixing process. More recently, chat systems like Slack and Gitter have gained a lot of popularity and developers are rapidly adopting them. Gitter is a chat system that is specifically designed to address the needs of GitHub users. Gitter hosts project-based asynchronous chats which foster frequent project discussions among participants. Developer discussions contain a wealth of information such as the rationale behind decisions made during the evolution of a project. In this study, we explore 24 open source project chat rooms that are hosted on Gitter, containing a total of 3,133,106 messages and 14,096 issue references. We manually analyze the contents of chat room discussions around 457 issue reports. The results of our study show the prevalence of issue discussions on Gitter, and that the discussed issue reports have a longer resolution time than the issue reports that are never brought on Gitter.},
keywords = {Developer discussions, Gitter, Issue reports},
pubstate = {published},
tppubtype = {article}
}
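Linking chat messages to issue reports, as done in this study, can be approximated with a simple reference extractor; the regex and messages below are illustrative:

import re

# Match "#123" style references and ".../issues/123" URLs.
ISSUE_REF = re.compile(r"(?:#|issues/)(\d+)")

messages = [
    "this looks related to #1423, can someone confirm?",
    "see https://github.com/org/repo/issues/98 for the stack trace",
]
refs = {m for msg in messages for m in ISSUE_REF.findall(msg)}
print(refs)  # {'1423', '98'}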