Mohammad Reza Taesiri; Cor-Paul Bezemer
VIDEOGAMEBUNNY: Towards vision assistants for video games Inproceedings
Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, 2025.
Abstract | BibTeX | Tags: Computer games, Foundation models, Game development, Game testing
@inproceedings{Taesiri_VideoGameBunny,
title = {VIDEOGAMEBUNNY: Towards vision assistants for video games},
author = {Mohammad Reza Taesiri and Cor-Paul Bezemer},
year = {2025},
date = {2025-03-01},
booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision},
abstract = {Large multimodal models (LMMs) hold substantial promise across various domains, from personal assistance in daily tasks to sophisticated applications like medical diagnostics. However, their capabilities have limitations in the video game domain, such as challenges with scene understanding, hallucinations, and inaccurate descriptions of video game content, especially in open-source models. This paper describes the development of VIDEOGAMEBUNNY, a LLaVA-style model based on Bunny, specifically tailored for understanding images from video games. We release intermediate checkpoints, training logs, and an extensive dataset comprising 185,259 video game images from 413 titles, along with 389,565 image-instruction pairs that include image captions, question-answer pairs, and a JSON representation of 16 elements of 136,974 images. Our experiments show that our high-quality game-related data has the potential to make a relatively small model outperform the much larger state-of-the-art model LLaVA-1.6-34b (which has more than 4x the number of parameters). Our study paves the way for future research in video game understanding on tasks such as playing, commentary, and debugging. Code and data are available at: https://videogamebunny.github.io/},
keywords = {Computer games, Foundation models, Game development, Game testing},
pubstate = {published},
tppubtype = {inproceedings}
}
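To make the dataset description above concrete, the sketch below shows what a single image-instruction pair might look like as a Python literal. The field names, file path, and the listed scene elements are illustrative assumptions, not the released schema; the actual format is documented at the project page.

# Hypothetical sketch of one image-instruction pair. All field names and
# values here are assumptions for illustration, not the released
# VideoGameBunny schema.
pair = {
    "image": "images/some_game/00042.png",  # one of the 185,259 game images
    "instruction": "What is the player character doing in this scene?",
    "response": "The player character is riding a horse across a grassy plain.",
    # The paper also describes a JSON representation covering 16 elements of
    # the scene for a subset of 136,974 images, for example:
    "scene_json": {
        "setting": "open field at dusk",
        "characters": ["player on horseback"],
        "ui_elements": ["health bar", "minimap"],
        # ...the remaining elements are omitted in this sketch
    },
}
print(pair["instruction"])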
Mohammad Reza Taesiri
Leveraging Foundation Models for Video Game Quality Assurance PhD Thesis
2024.
Abstract | BibTeX | Tags: Computer games, Computer vision, Game development, Game testing, Gameplay videos, Machine learning, Software quality
@phdthesis{phd_taesiri,
title = {Leveraging Foundation Models for Video Game Quality Assurance},
author = {Mohammad Reza Taesiri},
year = {2024},
date = {2024-09-25},
abstract = {The video game industry has become a powerhouse in the global entertainment economy. Creating engaging, high-quality games demands intricate development processes and significant resources. As projects grow in complexity and scale, developers often grapple with demanding schedules, tight deadlines, and the risk of burnout. These pressures highlight the need for more efficient development strategies, with quality assurance (QA) emerging as a critical area for optimization.

Artificial Intelligence (AI) has the potential to address these challenges by enhancing the game QA processes in large gaming companies. Specifically, foundation models (large pre-trained AI models) offer promising applications to improve these processes. Exploring novel uses of these advanced AI models could reveal their potential and limitations in optimizing game development workflows, potentially alleviating some of the industry’s pressing issues and facilitating the creation of high-quality, engaging games.

In this thesis, my goal is to improve video game testing processes by leveraging foundation models to ensure the final product reaches a desirable quality. I explore new opportunities that foundation models bring to game testing, from searching for instances of game bugs within video repositories to assisting human testers in catching bugs, through three studies:

First, I investigate the utility of image-text foundation models in retrieving gameplay videos. In this study, I create a video search engine designed to help developers efficiently search video repositories for examples of video game bugs using textual descriptions. For example, developers can find all instances of a bug by using a textual description of the bug, such as a horse flying in the air. This study lays the groundwork for AI-based game QA processes, with results demonstrating significant potential.

Next, I introduce GlitchBench, a benchmarking dataset of video game glitches and anomalies designed to assess state-of-the-art large multimodal models, such as GPT-4V, in detecting and understanding game bugs. This extensive dataset includes a wide range of images depicting various glitches, sourced from both online platforms and synthetic sets created within the Unity game engine. GlitchBench includes both common and rare glitches encountered in the video game quality assurance process. The findings from this study highlight both the promise and limitations of existing models, particularly in unusual and rare cases.

Lastly, I introduce VideoGameBunny, a large multimodal model specifically trained for video game content, accompanied by a dataset of 389,565 image-instruction pairs. My analysis demonstrates that VideoGameBunny outperforms much larger models in video game understanding tasks while using 4.2× fewer parameters. This result underscores the effectiveness and promise of using a high-quality dataset to improve models’ understanding of video games, thus making them more effective in the game QA process.

Future work should focus on enhancing the generalization and robustness of AI models in the gaming context, particularly through better integration of vision and language components. This integration could be achieved using either early or late fusion methods. For late fusion methods, where two pre-trained models are connected, better alignment between these components can be achieved through improved training data and strategies. Alternatively, early fusion techniques, which involve training both components simultaneously to enhance their integration, can overcome many issues that existing models have.},
keywords = {Computer games, Computer vision, Game development, Game testing, Gameplay videos, Machine learning, Software quality},
pubstate = {published},
tppubtype = {phdthesis}
}
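The first study described above frames bug search as text-to-image retrieval over frames sampled from gameplay videos. The sketch below illustrates that idea with an off-the-shelf CLIP model from the Hugging Face transformers library; the model choice, frame paths, and scoring details are assumptions for illustration, not the thesis's actual search engine.

# Minimal sketch of text-to-frame retrieval with an image-text foundation
# model (CLIP), in the spirit of the bug search engine described above.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Frames sampled from gameplay videos in the repository (paths are made up).
frame_paths = ["frames/clip_001.png", "frames/clip_002.png"]
images = [Image.open(p) for p in frame_paths]

query = "a horse flying in the air"  # the textual bug description
inputs = processor(text=[query], images=images, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)

# Higher similarity means a frame better matches the bug description;
# ranking frames by this score surfaces candidate bug instances.
scores = outputs.logits_per_image.squeeze(1)
best = scores.argmax().item()
print(f"Best match: {frame_paths[best]} (score={scores[best]:.2f})")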
Ian Gauk; Cor-Paul Bezemer
Detecting Discrepancies between Subtitles and Audio in Gameplay Videos with EchoTest Journal Article
IEEE Transactions on Games, 2024.
Abstract | BibTeX | Tags: Accessibility testing, Computer games, Game accessibility, Game development, Game testing
@article{Gauk_EchoTest,
title = {Detecting Discrepancies between Subtitles and Audio in Gameplay Videos with EchoTest},
author = {Ian Gauk and Cor-Paul Bezemer},
year = {2024},
date = {2024-07-30},
journal = {IEEE Transactions on Games},
abstract = {The landscape of accessibility features in video games remains inconsistent, posing challenges for gamers who seek experiences tailored to their needs. Accessibility features such as subtitles are widely used by players but are difficult to test manually due to the large scope of games and the variability in how subtitles can appear.

In this paper, we introduce an automated approach (ECHOTEST) to extract subtitles and spoken audio from a gameplay video, convert them into text, and compare them to detect discrepancies such as typos, desynchronization and missing text. ECHOTEST can be used by game developers to identify discrepancies between subtitles and spoken audio in their games, enabling them to better test the accessibility of their games.

In an empirical study on gameplay videos from 15 popular games, ECHOTEST can verify discrepancies between subtitles and audio with a precision of 98% and a recall of 89%. In addition, ECHOTEST performs well with a precision of 73% and a recall of 99% on a challenging generated benchmark.},
keywords = {Accessibility testing, Computer games, Game accessibility, Game development, Game testing},
pubstate = {published},
tppubtype = {article}
}
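ECHOTEST's core comparison, as described in the abstract, boils down to aligning two text streams: the subtitle text (e.g., recovered by OCR) and the spoken audio (e.g., recovered by speech-to-text). The sketch below is a minimal illustration of such a comparison using Python's standard difflib; it is a hedged reconstruction of the idea, not the actual ECHOTEST implementation, and the example strings are made up.

# Minimal sketch of a subtitle-vs-transcript comparison in the spirit of
# ECHOTEST. An illustrative reconstruction, not the tool itself.
import difflib

def compare_lines(subtitle: str, transcript: str):
    """Return a similarity ratio and the word-level differences."""
    ratio = difflib.SequenceMatcher(None, subtitle.lower(), transcript.lower()).ratio()
    # ndiff marks words only in the subtitle with "- " and words only in
    # the transcript with "+ "; identical words are left unmarked.
    diff = [d for d in difflib.ndiff(subtitle.lower().split(), transcript.lower().split())
            if d.startswith(("+ ", "- "))]
    return ratio, diff

# Example: a typo in the subtitle relative to the spoken line.
ratio, diff = compare_lines(
    "You must retreive the ancient sword.",   # subtitle (OCR output)
    "You must retrieve the ancient sword.",   # transcript (speech-to-text)
)
print(f"similarity={ratio:.2f}, differences={diff}")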