From f0f7a56c480e50b0472f7faafd5e561c6355398f Mon Sep 17 00:00:00 2001 From: Juanan Pereira Date: Thu, 9 Jan 2025 09:36:24 +0100 Subject: [PATCH] fix: improve YouTube transcript handling for manual captions --- .../open_webui/retrieval/loaders/youtube.py | 49 +++++++++++++------ package-lock.json | 2 +- package.json | 2 +- src/lib/i18n/locales/fr-FR/translation.json | 2 +- src/lib/i18n/locales/ie-GA/translation.json | 2 +- src/lib/i18n/locales/zh-CN/translation.json | 2 +- 6 files changed, 40 insertions(+), 19 deletions(-) diff --git a/backend/open_webui/retrieval/loaders/youtube.py b/backend/open_webui/retrieval/loaders/youtube.py index 63262a7ff..5a6d91a08 100644 --- a/backend/open_webui/retrieval/loaders/youtube.py +++ b/backend/open_webui/retrieval/loaders/youtube.py @@ -137,21 +137,42 @@ class YoutubeLoader: return [] try: - # First try to get transcript in requested language - transcript = transcript_list.find_transcript(self.language) - except NoTranscriptFound: - # Fallback: try to get any available transcript - available_transcripts = list( - transcript_list._generated_transcripts.values() - ) - if available_transcripts: - transcript = available_transcripts[0] - log.info( - f"Using first available transcript in language: {transcript.language_code}" - ) + # First try to get manual transcript in requested language + for lang in self.language: + try: + available_transcripts = ( + transcript_list._manually_created_transcripts + ) + if lang in available_transcripts: + transcript = available_transcripts[lang] + log.info(f"Found manual transcript in language: {lang}") + break + except NoTranscriptFound: + continue else: - log.error("No transcripts found for video") - return [] + # If no manual transcript found, try auto-generated ones + try: + transcript = transcript_list.find_transcript(self.language) + log.info( + f"Using auto-generated transcript in language: {transcript.language_code}" + ) + except NoTranscriptFound: + # Final fallback: try to get any available transcript + available_transcripts = list( + transcript_list._manually_created_transcripts.values() + ) + list(transcript_list._generated_transcripts.values()) + if available_transcripts: + transcript = available_transcripts[0] + log.info( + f"Using first available transcript in language: {transcript.language_code}" + ) + else: + log.error("No transcripts found for video") + return [] + + except Exception as e: + log.exception(f"Error fetching transcript: {str(e)}") + return [] transcript_pieces: List[Dict[str, Any]] = transcript.fetch() diff --git a/package-lock.json b/package-lock.json index f228148cd..3ae6220e3 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12589,4 +12589,4 @@ } } } -} \ No newline at end of file +} diff --git a/package.json b/package.json index 4da6fa88c..fec9eb728 100644 --- a/package.json +++ b/package.json @@ -110,4 +110,4 @@ "node": ">=18.13.0 <=22.x.x", "npm": ">=6.0.0" } -} \ No newline at end of file +} diff --git a/src/lib/i18n/locales/fr-FR/translation.json b/src/lib/i18n/locales/fr-FR/translation.json index db55088e4..84a8b6167 100644 --- a/src/lib/i18n/locales/fr-FR/translation.json +++ b/src/lib/i18n/locales/fr-FR/translation.json @@ -1062,4 +1062,4 @@ "Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "L'intégralité de votre contribution ira directement au développeur du plugin ; Open WebUI ne prend aucun pourcentage. Cependant, la plateforme de financement choisie peut avoir ses propres frais.", "Youtube": "YouTube", "Youtube Loader Settings": "Paramètres de l'outil de téléchargement YouTube" -} \ No newline at end of file +} diff --git a/src/lib/i18n/locales/ie-GA/translation.json b/src/lib/i18n/locales/ie-GA/translation.json index 4764e89b5..2e01fb124 100644 --- a/src/lib/i18n/locales/ie-GA/translation.json +++ b/src/lib/i18n/locales/ie-GA/translation.json @@ -1062,4 +1062,4 @@ "Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "Rachaidh do ranníocaíocht iomlán go díreach chuig an bhforbróir breiseán; Ní ghlacann Open WebUI aon chéatadán. Mar sin féin, d'fhéadfadh a tháillí féin a bheith ag an ardán maoinithe roghnaithe.", "Youtube": "Youtube", "Youtube Loader Settings": "Socruithe Luchtaire Youtube" -} \ No newline at end of file +} diff --git a/src/lib/i18n/locales/zh-CN/translation.json b/src/lib/i18n/locales/zh-CN/translation.json index ef73612b6..790a35c7b 100644 --- a/src/lib/i18n/locales/zh-CN/translation.json +++ b/src/lib/i18n/locales/zh-CN/translation.json @@ -1062,4 +1062,4 @@ "Your entire contribution will go directly to the plugin developer; Open WebUI does not take any percentage. However, the chosen funding platform might have its own fees.": "您的全部捐款将直接给到插件开发者,Open WebUI 不会收取任何比例。但众筹平台可能会有服务费、抽成。", "Youtube": "YouTube", "Youtube Loader Settings": "YouTube 爬取设置" -} \ No newline at end of file +}