% Journal article in Computer Speech & Language, vol. 87 (2024).
% NOTE(review): "institution" is not a standard field for article entries and
% is ignored by the standard styles; kept unchanged -- confirm it is still needed.
@article{ShangZhuDanneretal.2024,
  author      = {Shang, Aiguo and Zhu, Xinjuan and Danner, Michael and R{\"a}tsch, Matthias},
  title       = {Unsupervised question-retrieval approach based on topic keywords filtering and multi-task learning},
  journal     = {Computer Speech \& Language},
  volume      = {87},
  pages       = {101644},
  year        = {2024},
  issn        = {0885-2308},
  doi         = {10.1016/j.csl.2024.101644},
  institution = {Technik},
  language    = {en},
  abstract    = {Currently, the majority of retrieval-based question-answering systems depend on supervised training using question pairs. However, there is still a significant need for further exploration of how to employ unsupervised methods to improve the accuracy of retrieval-based question-answering systems. From the perspective of question topic keywords, this paper presents TFCSG, an unsupervised question-retrieval approach based on topic keyword filtering and multi-task learning. Firstly, we design the topic keyword filtering algorithm, which, unlike the topic model, can sequentially filter out the keywords of the question and can provide a training corpus for subsequent unsupervised learning. Then, three tasks are designed in this paper to complete the training of the question-retrieval model. The first task is a question contrastive learning task based on topic keywords repetition strategy, the second is questions and its corresponding sequential topic keywords similarity distribution task, and the third is a sequential topic keywords generation task using questions. These three tasks are trained in parallel in order to obtain quality question representations and thus improve the accuracy of question-retrieval task. Finally, our experimental results on the four publicly available datasets demonstrate the effectiveness of the TFCSG, with an average improvement of 7.1\%, 4.4\%, and 3.5\% in the P@1, MAP, and MRR metrics when using the BERT model compared to the baseline model. 
The corresponding metrics improved by 5.7\%, 3.5\% and 3.0\% on average when using the RoBERTa model. The accuracy of unsupervised similar question-retrieval task is effectively improved. In particular, the values of P@1, P@5, and P@10 are close, the retrieved similar questions are ranked more advance.}
}