klinger.bib

@misc{li2024ipropinteractivepromptoptimization,
  title = {{iPrOp}: Interactive Prompt Optimization for Large
                  Language Models with a Human in the Loop},
  author = {Jiahui Li and Roman Klinger},
  year = {2024},
  eprint = {2412.12644},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2412.12644},
  internaltype = {preprint}
}

@misc{wuehrl2024selfadaptiveparaphrasingpreferencelearning,
  title = {Self-Adaptive Paraphrasing and Preference Learning for Improved Claim Verifiability},
  author = {Amelie W{\"u}hrl and Roman Klinger},
  year = {2024},
  eprint = {2412.11653},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2412.11653},
  internaltype = {preprint}
}

@misc{papay2024regularpatternsensitivecrfsdistantlabel,
  title = {Regular-pattern-sensitive {CRFs} for Distant Label
                  Interactions},
  author = {Sean Papay and Roman Klinger and Sebastian Pado},
  year = {2024},
  eprint = {2411.12484},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  url = {https://arxiv.org/abs/2411.12484},
  internaltype = {preprint}
}

@misc{bamnlp2024,
  title = {Which Demographics do {LLMs} Default to During
                  Annotation?},
  author = {Sch{\"a}fer, Johannes and Combs, Aidan and Bagdon,
                  Christopher and Li, Jiahui and Probol, Nadine and
                  Greschner, Lynn and Papay, Sean and Menchaca
                  Resendiz, Yarik and Velutharambath, Aswathy and
                  W{\"u}hrl, Amelie and Weber, Sabine and Klinger,
                  Roman},
  year = {2024},
  eprint = {2410.08820},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2410.08820},
  internaltype = {preprint}
}

@inproceedings{troiano-etal-2024-dealing,
  title = {Dealing with Controversy: An Emotion and Coping
                  Strategy Corpus Based on Role Playing},
  author = {Troiano, Enrica and Labat, Sofie and Stranisci,
                  Marco and Damiano, Rossana and Patti, Viviana and
                  Klinger, Roman},
  editor = {Al-Onaizan, Yaser and Bansal, Mohit and Chen,
                  Yun-Nung},
  booktitle = {Findings of the Association for Computational
                  Linguistics: EMNLP 2024},
  month = nov,
  year = {2024},
  address = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2024.findings-emnlp.89},
  pages = {1634--1658},
  abstract = {There is a mismatch between psychological and
                  computational studies on emotions. Psychological
                  research aims at explaining and documenting internal
                  mechanisms of these phenomena, while computational
                  work often simplifies them into labels. Many emotion
                  fundamentals remain under-explored in natural
                  language processing, particularly how emotions
                  develop and how people cope with them. To help
                  reduce this gap, we follow theories on coping, and
                  treat emotions as strategies to cope with salient
                  situations (i.e., how people deal with
                  emotion-eliciting events). This approach allows us
                  to investigate the link between emotions and
                  behavior, which also emerges in language. We
                  introduce the task of coping identification,
                  together with a corpus to do so, constructed via
                  role-playing. We find that coping strategies realize
                  in text even though they are challenging to
                  recognize, both for humans and automatic systems
                  trained and prompted on the same task. We thus open
                  up a promising research direction to enhance the
                  capability of models to better capture emotion
                  mechanisms from text.},
  internaltype = {conferenceproc},
  pdf = {https://www.romanklinger.de/publications/TroianoLabatStranisciDamianoPattiKlinger_EMNLP-Findings2024.pdf},
  eprint = {2409.19025},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL}
}

@inproceedings{velutharambath-etal-2024-entangled,
  title = {How Entangled is Factuality and Deception in
                  {G}erman?},
  author = {Velutharambath, Aswathy and Wuehrl, Amelie and
                  Klinger, Roman},
  editor = {Al-Onaizan, Yaser and Bansal, Mohit and Chen,
                  Yun-Nung},
  booktitle = {Findings of the Association for Computational
                  Linguistics: EMNLP 2024},
  month = nov,
  year = {2024},
  address = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2024.findings-emnlp.557},
  pages = {9538--9554},
  abstract = {The statement {``}The earth is flat{''} is factually
                  inaccurate, but if someone truly believes and argues
                  in its favor, it is not deceptive. Research on
                  deception detection and fact checking often
                  conflates factual accuracy with the truthfulness of
                  statements. This assumption makes it difficult to
                  (a) study subtle distinctions and interactions
                  between the two and (b) gauge their effects on
                  downstream tasks. The belief-based deception
                  framework disentangles these properties by defining
                  texts as deceptive when there is a mismatch between
                  what people say and what they truly believe. In this
                  study, we assess if presumed patterns of deception
                  generalize to German language texts. We test the
                  effectiveness of computational models in detecting
                  deception using an established corpus of
                  belief-based argumentation. Finally, we gauge the
                  impact of deception on the downstream task of fact
                  checking and explore if this property confounds
                  verification models. Surprisingly, our analysis
                  finds no correlation with established cues of
                  deception. Previous work claimed that computational
                  models can outperform humans in deception detection
                  accuracy, however, our experiments show that both
                  traditional and state-of-the-art models struggle
                  with the task, performing no better than random
                  guessing. For fact checking, we find that natural
                  language inference-based verification performs worse
                  on non-factual and deceptive content, while
                  prompting large language models for the same task is
                  less sensitive to these properties.},
  internaltype = {conferenceproc},
  pdf = {https://www.romanklinger.de/publications/VelutharambathWuehrlKlinger-EMNLP-Findings2024.pdf},
  eprint = {2409.20165},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL}
}

@misc{HofmannSindermannKlinger2024,
  title = {Prompt-based Personality Profiling: Reinforcement Learning for Relevance Filtering},
  author = {Jan Hofmann and Cornelia Sindermann and Roman Klinger},
  year = {2024},
  eprint = {2409.04122},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  url = {https://arxiv.org/abs/2409.04122},
  internaltype = {preprint}
}

@proceedings{wassa-2024-approaches,
  editor       = {De Clercq, Orph{\'e}e and Barriere, Valentin and
                  Barnes, Jeremy and Klinger, Roman and Sedoc,
                  Jo{\~a}o and Tafreshi, Shabnam},
  title        = {Proceedings of the 14th Workshop on Computational
                  Approaches to Subjectivity, Sentiment, {\&} Social
                  Media Analysis},
  year         = {2024},
  month        = aug,
  publisher    = {Association for Computational Linguistics},
  address      = {Bangkok, Thailand},
  url          = {https://aclanthology.org/2024.wassa-1.0},
  internaltype = {edited}
}

@inproceedings{Wuehrl2024,
  author       = {Wuehrl, Amelie and Greschner, Lynn and Menchaca
                  Resendiz, Yarik and Klinger, Roman},
  title        = {{IMS}{\_}medic{ALY} at {\#}{SMM}4{H} 2024: Detecting
                  Impacts of Outdoor Spaces on Social Anxiety with
                  Data Augmented Ensembling},
  editor       = {Xu, Dongfang and Gonzalez-Hernandez, Graciela},
  booktitle    = {Proceedings of The 9th Social Media Mining for
                  Health Research and Applications (SMM4H 2024)
                  Workshop and Shared Tasks},
  year         = {2024},
  month        = aug,
  publisher    = {Association for Computational Linguistics},
  address      = {Bangkok, Thailand},
  pages        = {83--87},
  url          = {https://aclanthology.org/2024.smm4h-1.19},
  internaltype = {workshop},
  abstract     = {Many individuals affected by Social Anxiety Disorder
                  turn to social media platforms to share their
                  experiences and seek advice. This includes
                  discussing the potential benefits of engaging with
                  outdoor environments. As part of {\#}SMM4H 2024,
                  Shared Task 3 focuses on classifying the effects of
                  outdoor spaces on social anxiety symptoms in Reddit
                  posts. In our contribution to the task, we explore
                  the effectiveness of domain-specific models (trained
                  on social media data {--} SocBERT) against general
                  domain models (trained on diverse datasets {--}
                  BERT, RoBERTa, GPT-3.5) in predicting the sentiment
                  related to outdoor spaces. Further, we assess the
                  benefits of augmenting sparse human-labeled data
                  with synthetic training instances and evaluate the
                  complementary strengths of domain-specific and
                  general classifiers using an ensemble model. Our
                  results show that (1) fine-tuning small,
                  domain-specific models generally outperforms large
                  general language models in most cases. Only one
                  large language model (GPT-4) exhibits performance
                  comparable to the fine-tuned models (52{\%}
                  F1). Further, we find that (2) synthetic data does
                  improve the performance of fine-tuned models in some
                  cases, and (3) models do not appear to complement
                  each other in our ensemble setup.}
}

@inproceedings{Schaefer2024,
  author       = {Sch{\"a}fer, Johannes and Heid, Ulrich and Klinger,
                  Roman},
  title        = {Hierarchical Adversarial Correction to Mitigate
                  Identity Term Bias in Toxicity Detection},
  editor       = {De Clercq, Orph{\'e}e and Barriere, Valentin and
                  Barnes, Jeremy and Klinger, Roman and Sedoc,
                  Jo{\~a}o and Tafreshi, Shabnam},
  booktitle    = {Proceedings of the 14th Workshop on Computational
                  Approaches to Subjectivity, Sentiment, {\&} Social
                  Media Analysis},
  year         = {2024},
  month        = aug,
  publisher    = {Association for Computational Linguistics},
  address      = {Bangkok, Thailand},
  pages        = {35--51},
  url          = {https://aclanthology.org/2024.wassa-1.4},
  pdf          = {https://www.romanklinger.de/publications/SchaeferHeidKlingerWASSA2024.pdf},
  internaltype = {workshop},
  abstract     = {Corpora that are the fundament for toxicity
                  detection contain such expressions typically
                  directed against a target individual or group, e.g.,
                  people of a specific gender or ethnicity. Prior work
                  has shown that the target identity mention can
                  constitute a confounding variable. As an example, a
                  model might learn that Christians are always
                  mentioned in the context of hate speech. This
                  misguided focus can lead to a limited generalization
                  to newly emerging targets that are not found in the
                  training data. In this paper, we hypothesize and
                  subsequently show that this issue can be mitigated
                  by considering targets on different levels of
                  specificity. We distinguish levels of (1) the
                  existence of a target, (2) a class (e.g., that the
                  target is a religious group), or (3) a specific
                  target group (e.g., Christians or Muslims). We
                  define a target label hierarchy based on these three
                  levels and then exploit this hierarchy in an
                  adversarial correction for the lowest level
                  (i.e. (3)) while maintaining some basic target
                  features. This approach does not lower the toxicity
                  detection performance but increases the
                  generalization to targets not being available at
                  training time.}
}

@inproceedings{Ronningstad2024,
  author        = {R{\o}nningstad, Egil and Klinger, Roman and Velldal,
                  Erik and {\O}vrelid, Lilja},
  title         = {Entity-Level Sentiment: More than the Sum of Its
                  Parts},
  editor        = {De Clercq, Orph{\'e}e and Barriere, Valentin and
                  Barnes, Jeremy and Klinger, Roman and Sedoc,
                  Jo{\~a}o and Tafreshi, Shabnam},
  booktitle     = {Proceedings of the 14th Workshop on Computational
                  Approaches to Subjectivity, Sentiment, {\&} Social
                  Media Analysis},
  year          = {2024},
  month         = aug,
  publisher     = {Association for Computational Linguistics},
  address       = {Bangkok, Thailand},
  pages         = {84--96},
  url           = {https://aclanthology.org/2024.wassa-1.8},
  pdf           = {https://www.romanklinger.de/publications/RønningstadKlingerVelldalØvrelid_WASSA2024.pdf},
  eprint        = {2407.03916},
  archiveprefix = {arXiv},
  internaltype  = {workshop},
  abstract      = {In sentiment analysis of longer texts, there may be
                  a variety of topics discussed, of entities
                  mentioned, and of sentiments expressed regarding
                  each entity. We find a lack of studies exploring how
                  such texts express their sentiment towards each
                  entity of interest, and how these sentiments can be
                  modelled. In order to better understand how
                  sentiment regarding persons and organizations (each
                  entity in our scope) is expressed in longer texts,
                  we have collected a dataset of expert annotations
                  where the overall sentiment regarding each entity is
                  identified, together with the sentence-level
                  sentiment for these entities separately. We show
                  that the reader{'}s perceived sentiment regarding an
                  entity often differs from an arithmetic aggregation
                  of sentiments at the sentence level. Only 70{\%} of
                  the positive and 55{\%} of the negative entities
                  receive a correct overall sentiment label when we
                  aggregate the (human-annotated) sentiment labels for
                  the sentences where the entity is mentioned. Our
                  dataset reveals the complexity of entity-specific
                  sentiment in longer texts, and allows for more
                  precise modelling and evaluation of such sentiment
                  expressions.}
}

@inproceedings{bagdon-etal-2024-expert,
  title = {{``}You are an expert annotator{''}: Automatic Best{--}Worst-Scaling Annotations for Emotion Intensity Modeling},
  author = {Bagdon, Christopher  and
      Karmalkar, Prathamesh  and
      Gurulingappa, Harsha  and
      Klinger, Roman},
  editor = {Duh, Kevin  and
      Gomez, Helena  and
      Bethard, Steven},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month = jun,
  year = {2024},
  address = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2024.naacl-long.439},
  pages = {7917--7929},
  abstract = {Labeling corpora constitutes a bottleneck to create models for new tasks or domains. Large language models mitigate the issue with automatic corpus labeling methods, particularly for categorical annotations. Some NLP tasks such as emotion intensity prediction, however, require text regression, but there is no work on automating annotations for continuous label assignments. Regression is considered more challenging than classification: The fact that humans perform worse when tasked to choose values from a rating scale lead to comparative annotation methods, including best{--}worst scaling. This raises the question if large language model-based annotation methods show similar patterns, namely that they perform worse on rating scale annotation tasks than on comparative annotation tasks. To study this, we automate emotion intensity predictions and compare direct rating scale predictions, pairwise comparisons and best{--}worst scaling. We find that the latter shows the highest reliability. A transformer regressor fine-tuned on these data performs nearly on par with a model trained on the original manual annotations.},
  internaltype = {conferenceproc},
  pdf = {https://www.romanklinger.de/publications/BagdonNAACL2024.pdf}
}

@inproceedings{Wuehrl2024b,
  title = {Understanding Fine-grained Distortions in Reports of Scientific Findings},
  author = {Wuehrl, Amelie  and
      Wright, Dustin  and
      Klinger, Roman  and
      Augenstein, Isabelle},
  editor = {Ku, Lun-Wei  and
      Martins, Andre  and
      Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  month = aug,
  year = {2024},
  address = {Bangkok, Thailand and virtual meeting},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2024.findings-acl.369},
  pages = {6175--6191},
  abstract = {Distorted science communication harms individuals and society as it can lead to unhealthy behavior change and decrease trust in scientific institutions. Given the rapidly increasing volume of science communication in recent years, a fine-grained understanding of how findings from scientific publications are reported to the general public, and methods to detect distortions from the original work automatically, are crucial. Prior work focused on individual aspects of distortions or worked with unpaired data. In this work, we make three foundational contributions towards addressing this problem: (1) annotating 1,600 instances of scientific findings from academic papers paired with corresponding findings as reported in news articles and tweets wrt. four characteristics: causality, certainty, generality and sensationalism; (2) establishing baselines for automatically detecting these characteristics; and (3) analyzing the prevalence of changes in these characteristics in both human-annotated and large-scale unlabeled data. Our results show that scientific findings frequently undergo subtle distortions when reported. Tweets distort findings more often than science news reports. Detecting fine-grained distortions automatically poses a challenging task. In our experiments, fine-tuned task-specific models consistently outperform few-shot LLM prompting.},
  pdf = {https://www.romanklinger.de/publications/WuehrlEtAlACLFindings2024.pdf},
  archiveprefix = {arXiv},
  eprint = {2402.12431},
  internaltype = {conferenceproc}
}

@inproceedings{Wemmer2024,
  author       = {Wemmer, Eileen and Labat, Sofie and Klinger, Roman},
  title        = {{E}mo{P}rogress: Cumulated Emotion Progression
                  Analysis in Dreams and Customer Service Dialogues},
  editor       = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste,
                  Veronique and Lenci, Alessandro and Sakti, Sakriani
                  and Xue, Nianwen},
  booktitle    = {Proceedings of the 2024 Joint International
                  Conference on Computational Linguistics, Language
                  Resources and Evaluation (LREC-COLING 2024)},
  year         = {2024},
  month        = may,
  publisher    = {ELRA and ICCL},
  address      = {Torino, Italy},
  pages        = {5660--5677},
  url          = {https://aclanthology.org/2024.lrec-main.503},
  pdf          = {https://www.romanklinger.de/publications/WemmerLabatKlingerLRECCOLING2024.pdf},
  internaltype = {conferenceproc}
}

@inproceedings{Velutharambath2024,
  author        = {Velutharambath, Aswathy and W{\"u}hrl, Amelie and
                  Klinger, Roman},
  title         = {Can Factual Statements Be Deceptive? The
                  {D}e{F}a{B}el Corpus of Belief-based Deception},
  editor        = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste,
                  Veronique and Lenci, Alessandro and Sakti, Sakriani
                  and Xue, Nianwen},
  booktitle     = {Proceedings of the 2024 Joint International
                  Conference on Computational Linguistics, Language
                  Resources and Evaluation (LREC-COLING 2024)},
  year          = {2024},
  month         = may,
  publisher     = {ELRA and ICCL},
  address       = {Torino, Italy},
  pages         = {2708--2723},
  url           = {https://aclanthology.org/2024.lrec-main.243},
  pdf           = {https://www.romanklinger.de/publications/VelutharambathWuehrlKlinger-LREC-COLING2024.pdf},
  eprint        = {2403.10185},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  internaltype  = {conferenceproc}
}

@inproceedings{Barreiss20242,
  author = {Barei{\ss}, Patrick and Klinger, Roman and Barnes,
                  Jeremy},
  title = {English Prompts are Better for {NLI}-based Zero-Shot
                  Emotion Classification than Target-Language Prompts},
  year = {2024},
  isbn = {9798400701726},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3589335.3651902},
  doi = {10.1145/3589335.3651902},
  abstract = {Emotion classification in text is a challenging task
                  due to the processes involved when interpreting a
                  textual description of a potential emotion
                  stimulus. In addition, the set of emotion categories
                  is highly domain-specific. For instance, literature
                  analysis might require the use of aesthetic emotions
                  (e.g., finding something beautiful), and social
                  media analysis could benefit from fine-grained sets
                  (e.g., separating anger from annoyance) than only
                  those that represent basic categories as they have
                  been proposed by Paul Ekman (anger, disgust, fear,
                  joy, surprise, sadness). This renders the task an
                  interesting field for zero-shot classifications, in
                  which the label set is not known at model
                  development time. Unfortunately, most resources for
                  emotion analysis are English, and therefore, most
                  studies on emotion analysis have been performed in
                  English, including those that involve prompting
                  language models for text labels. This leaves us with
                  a research gap that we address in this paper: In
                  which language should we prompt for emotion labels
                  on non-English texts? This is particularly of
                  interest when we have access to a multilingual large
                  language model, because we could request labels with
                  English prompts even for non-English data. Our
                  experiments with natural language inference-based
                  language models show that it is consistently better
                  to use English prompts even if the data is in a
                  different language.},
  booktitle = {Companion Proceedings of the ACM on Web Conference
                  2024},
  pages = {1318--1326},
  numpages = {9},
  location = {Singapore, Singapore},
  series = {WWW '24},
  internaltype = {workshop}
}

@inproceedings{wegge-klinger-2024-topic,
  author        = {Wegge, Maximilian and Klinger, Roman},
  title         = {Topic Bias in Emotion Classification},
  editor        = {van der Goot, Rob and Bak, JinYeong and
                  M{\"u}ller-Eberstein, Max and Xu, Wei and Ritter,
                  Alan and Baldwin, Tim},
  booktitle     = {Proceedings of the Ninth Workshop on Noisy and
                  User-generated Text (W-NUT 2024)},
  year          = {2024},
  month         = mar,
  publisher     = {Association for Computational Linguistics},
  address       = {San {\.G}iljan, Malta},
  pages         = {89--103},
  url           = {https://aclanthology.org/2024.wnut-1.9},
  pdf           = {https://www.romanklinger.de/publications/WeggeKlinger2024.pdf},
  eprint        = {2312.09043},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  internaltype  = {workshop},
  abstract      = {Emotion corpora are typically sampled based on
                  keyword/hashtag search or by asking study
                  participants to generate textual instances. In any
                  case, these corpora are not uniform samples
                  representing the entirety of a domain. We
                  hypothesize that this practice of data acquision
                  leads to unrealistic correlations between
                  overrepresented topics in these corpora that harm
                  the generalizability of models. Such topic bias
                  could lead to wrong predictions for instances like
                  {``}I organized the service for my aunt{'}s
                  funeral.{''} when funeral events are overpresented
                  for instances labeled with sadness, despite the
                  emotion of pride being more appropriate here. In
                  this paper, we study this topic bias both from the
                  data and the modeling perspective. We first label a
                  set of emotion corpora automatically via topic
                  modeling and show that emotions in fact correlate
                  with specific topics. Further, we see that emotion
                  classifiers are confounded by such topics. Finally,
                  we show that the established debiasing method of
                  adversarial correction via gradient reversal
                  mitigates the issue. Our work points out issues with
                  existing emotion corpora and that more
                  representative resources are required for fair
                  evaluation of models predicting affective concepts
                  from text.}
}

@inproceedings{wuehrl-etal-2024-makes,
  title = {What Makes Medical Claims (Un)Verifiable? Analyzing
                  Entity and Relation Properties for Fact
                  Verification},
  author = {W{\"u}hrl, Amelie and Menchaca Resendiz, Yarik and
                  Grimminger, Lara and Klinger, Roman},
  editor = {Graham, Yvette and Purver, Matthew},
  booktitle = {Proceedings of the 18th Conference of the European
                  Chapter of the Association for Computational
                  Linguistics (Volume 1: Long Papers)},
  month = mar,
  year = {2024},
  address = {St. Julian{'}s, Malta},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2024.eacl-long.124},
  pages = {2046--2058},
  abstract = {Verifying biomedical claims fails if no evidence can
                  be discovered. In these cases, the fact-checking
                  verdict remains unknown and the claim is
                  unverifiable. To improve this situation, we have to
                  understand if there are any claim properties that
                  impact its verifiability. In this work we assume
                  that entities and relations define the core
                  variables in a biomedical claim{'}s anatomy and
                  analyze if their properties help us to differentiate
                  verifiable from unverifiable claims. In a study with
                  trained annotation experts we prompt them to find
                  evidence for biomedical claims, and observe how they
                  refine search queries for their evidence
                  search. This leads to the first corpus for
                  scientific fact verification annotated with
                  subject{--}relation{--}object triplets, evidence
                  documents, and fact-checking verdicts (the BEAR-FACT
                  corpus). We find (1) that discovering evidence for
                  negated claims (e.g., X{--}does-not-cause{--}Y) is
                  particularly challenging. Further, we see that
                  annotators process queries mostly by adding
                  constraints to the search and by normalizing
                  entities to canonical names. (2) We compare our
                  in-house annotations with a small crowdsourcing
                  setting where we employ both medical experts and
                  laypeople. We find that domain expertise does not
                  have a substantial effect on the reliability of
                  annotations. Finally, (3), we demonstrate that it is
                  possible to reliably estimate the success of
                  evidence retrieval purely from the claim text
                  (.82F$_1$), whereas identifying unverifiable claims
                  proves more challenging (.27F$_1$)},
  pdf = {https://www.romanklinger.de/publications/Wuehrl-etal-2024-EACL.pdf},
  eprint = {2402.01360},
  archiveprefix = {arXiv},
  primaryclass = {cs.CL},
  internaltype = {conferenceproc}
}