Hao Li; Cor-Paul Bezemer; Ahmed E. Hassan
Software Engineering and Foundation Models: Insights from Industry Blogs Using a Jury of Foundation Models Inproceedings
International Conference on Software Engineering - Software Engineering in Practice (ICSE - SEIP) Track, 2025.
Tags: FM4SE, Foundation models, SE4AI, SE4FM, SE4ML
@inproceedings{Li_SEFM_blogs,
title = {Software Engineering and Foundation Models: Insights from Industry Blogs Using a Jury of Foundation Models},
author = {Hao Li and Cor-Paul Bezemer and Ahmed E. Hassan},
year = {2025},
date = {2025-04-27},
booktitle = {International Conference on Software Engineering - Software Engineering in Practice (ICSE - SEIP) Track},
abstract = {Foundation models (FMs) such as large language models (LLMs) have significantly impacted many fields, including software engineering (SE). The interaction between SE and FMs has led to the integration of FMs into SE practices (FM4SE) and the application of SE methodologies to FMs (SE4FM). While several literature surveys exist on academic contributions to these trends, we are the first to provide a practitioner’s view. We analyze 155 FM4SE and 997 SE4FM blog posts from leading technology companies, leveraging an FM-powered surveying approach to systematically label and summarize the discussed activities and tasks. We observed that while code generation is the most prominent FM4SE task, FMs are leveraged for many other SE activities such as code understanding, summarization, and API recommendation. The majority of blog posts on SE4FM are about model deployment & operation, and system architecture & orchestration. Although the emphasis is on cloud deployments, there is a growing interest in compressing FMs and deploying them on smaller devices such as edge or mobile devices. We outline eight future research directions inspired by our gained insights, aiming to bridge the gap between academic findings and real-world applications. Our study not only enriches the body of knowledge on practical applications of FM4SE and SE4FM but also demonstrates the utility of FMs as a powerful and efficient approach in conducting literature surveys within technical and grey literature domains. Our dataset, results, code and used prompts can be found in our online replication package at https://zenodo.org/records/14563992.},
keywords = {FM4SE, Foundation models, SE4AI, SE4FM, SE4ML},
pubstate = {published},
tppubtype = {inproceedings}
}
Mohammad Reza Taesiri; Cor-Paul Bezemer
VIDEOGAMEBUNNY: Towards vision assistants for video games Inproceedings
Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision, 2025.
Tags: Computer games, Foundation models, Game development, Game testing
@inproceedings{Taesiri_VideoGameBunny,
title = {VIDEOGAMEBUNNY: Towards vision assistants for video games},
author = {Mohammad Reza Taesiri and Cor-Paul Bezemer},
year = {2025},
date = {2025-03-01},
booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision},
abstract = {Large multimodal models (LMMs) hold substantial promise across various domains, from personal assistance in daily tasks to sophisticated applications like medical diagnostics. However, their capabilities have limitations in the video game domain, such as challenges with scene understanding, hallucinations, and inaccurate descriptions of video game content, especially in open-source models. This paper describes the development of VIDEOGAMEBUNNY, a LLaVA-style model based on Bunny, specifically tailored for understanding images from video games. We release intermediate checkpoints, training logs, and an extensive dataset comprising 185,259 video game images from 413 titles, along with 389,565 image-instruction pairs that include image captions, question-answer pairs, and a JSON representation of 16 elements of 136,974 images. Our experiments show that our high-quality game-related data has the potential to make a relatively small model outperform the much larger state-of-the-art model LLaVA-1.6-34b (which has more than 4x the number of parameters). Our study paves the way for future research in video game understanding on tasks such as playing, commentary, and debugging. Code and data are available at: https://videogamebunny.github.io/},
keywords = {Computer games, Foundation models, Game development, Game testing},
pubstate = {published},
tppubtype = {inproceedings}
}
Mohammad Reza Taesiri; Tianjun Feng; Anh Nguyen; Cor-Paul Bezemer
GlitchBench: Can large multimodal models detect video game glitches? Inproceedings
IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024.
Tags: Computer games, Foundation models, Game development, Gameplay videos, LLM
@inproceedings{TaesiriCVPR2024,
title = {GlitchBench: Can large multimodal models detect video game glitches?},
author = {Mohammad Reza Taesiri and Tianjun Feng and Anh Nguyen and Cor-Paul Bezemer},
year = {2024},
date = {2024-06-15},
urldate = {2024-03-15},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
abstract = {Large multimodal models (LMMs) have evolved from large language models (LLMs) to integrate multiple input modalities, such as visual inputs. This integration augments the capacity of LLMs for tasks requiring visual comprehension and reasoning. However, the extent and limitations of their enhanced abilities are not fully understood, especially when it comes to real-world tasks. To address this gap, we introduce GlitchBench, a novel benchmark derived from video game quality assurance tasks, to test and evaluate the reasoning capabilities of LMMs. Our benchmark is curated from a variety of unusual and glitched scenarios from video games and aims to challenge both the visual and linguistic reasoning powers of LMMs in detecting and interpreting out-of-the-ordinary events. We evaluate multiple state-of-the-art LMMs, and we show that GlitchBench presents a new challenge for these models. Code and data are available at: https://glitchbench.github.io/},
keywords = {Computer games, Foundation models, Game development, Gameplay videos, LLM},
pubstate = {published},
tppubtype = {inproceedings}
}