feat: Add control for how message content is split for TTS generation reqs
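In short, the change introduces a `split_on` option (exposed in the admin Audio settings as "Response splitting") that controls how an assistant response is chunked before being sent to the TTS endpoint: per sentence (`punctuation`, the previous behaviour and the default), per paragraph (`paragraphs`), or not at all (`none`). Below is a minimal sketch of the selection logic, mirroring the switch this commit adds to the response message component; the standalone helper name `prepareContentForTTS` is illustrative and not part of the commit:

```typescript
import {
  extractParagraphsForAudio,
  extractSentencesForAudio,
  prepareTextForTTS
} from '$lib/utils';
import { TTS_RESPONSE_SPLIT } from '$lib/types';

// Illustrative helper: pick the chunks that will each become one TTS request.
const prepareContentForTTS = (content: string, splitOn: TTS_RESPONSE_SPLIT): string[] => {
  switch (splitOn) {
    case TTS_RESPONSE_SPLIT.PARAGRAPHS:
      // one request per paragraph (text split on newlines, code blocks left intact)
      return extractParagraphsForAudio(content);
    case TTS_RESPONSE_SPLIT.NONE:
      // a single request carrying the whole cleaned response
      return [prepareTextForTTS(content)];
    case TTS_RESPONSE_SPLIT.PUNCTUATION:
    default:
      // previous behaviour: one request per (merged) sentence
      return extractSentencesForAudio(content);
  }
};
```

The backend default comes from the new `AUDIO_TTS_SPLIT_ON` environment variable (falling back to `punctuation`) and is surfaced to the client as `config.audio.tts.split_on`.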
@@ -37,6 +37,7 @@ from config import (
     AUDIO_TTS_ENGINE,
     AUDIO_TTS_MODEL,
     AUDIO_TTS_VOICE,
+    AUDIO_TTS_SPLIT_ON,
     AppConfig,
     CORS_ALLOW_ORIGIN,
 )
@@ -72,6 +73,7 @@ app.state.config.TTS_ENGINE = AUDIO_TTS_ENGINE
 app.state.config.TTS_MODEL = AUDIO_TTS_MODEL
 app.state.config.TTS_VOICE = AUDIO_TTS_VOICE
 app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY
+app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
 
 # setting device type for whisper model
 whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
@@ -88,6 +90,7 @@ class TTSConfigForm(BaseModel):
     ENGINE: str
     MODEL: str
     VOICE: str
+    SPLIT_ON: str
 
 
 class STTConfigForm(BaseModel):
@@ -139,6 +142,7 @@ async def get_audio_config(user=Depends(get_admin_user)):
             "ENGINE": app.state.config.TTS_ENGINE,
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
+            "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
@@ -159,6 +163,7 @@ async def update_audio_config(
     app.state.config.TTS_ENGINE = form_data.tts.ENGINE
     app.state.config.TTS_MODEL = form_data.tts.MODEL
     app.state.config.TTS_VOICE = form_data.tts.VOICE
+    app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
 
     app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL
     app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
@@ -173,6 +178,7 @@ async def update_audio_config(
             "ENGINE": app.state.config.TTS_ENGINE,
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
+            "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
@@ -1484,3 +1484,9 @@ AUDIO_TTS_VOICE = PersistentConfig(
     "audio.tts.voice",
     os.getenv("AUDIO_TTS_VOICE", "alloy"), # OpenAI default voice
 )
+
+AUDIO_TTS_SPLIT_ON = PersistentConfig(
+    "AUDIO_TTS_SPLIT_ON",
+    "audio.tts.split_on",
+    os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"),
+)
@@ -1924,6 +1924,7 @@ async def get_app_config(request: Request):
             "tts": {
                 "engine": audio_app.state.config.TTS_ENGINE,
                 "voice": audio_app.state.config.TTS_VOICE,
+                "split_on": audio_app.state.config.TTS_SPLIT_ON,
             },
             "stt": {
                 "engine": audio_app.state.config.STT_ENGINE,
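On the client, the new value arrives through the app config endpoint alongside the existing audio settings. Roughly the shape the frontend reads (only the fields shown in the hunks above; the exact enclosing payload is assumed):

```typescript
// Sketch of the audio section of the app config as read by the frontend;
// field names come from the diff, the surrounding payload is an assumption.
interface AudioAppConfig {
  tts: {
    engine: string;
    voice: string;
    split_on: string; // 'punctuation' (default) | 'paragraphs' | 'none'
  };
  stt: {
    engine: string;
  };
}
```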
@@ -132,7 +132,11 @@ export const synthesizeOpenAISpeech = async (
   return res;
 };
 
-export const getModels = async (token: string = '') => {
+interface AvailableModelsResponse {
+  models: { name: string; id: string }[] | { id: string }[];
+}
+
+export const getModels = async (token: string = ''): Promise<AvailableModelsResponse> => {
   let error = null;
 
   const res = await fetch(`${AUDIO_API_BASE_URL}/models`, {
@@ -10,31 +10,36 @@
     getModels as _getModels,
     getVoices as _getVoices
   } from '$lib/apis/audio';
-  import { user, settings, config } from '$lib/stores';
+  import { config } from '$lib/stores';
 
   import SensitiveInput from '$lib/components/common/SensitiveInput.svelte';
 
-  const i18n = getContext('i18n');
-
-  export let saveHandler: Function;
+  import { TTS_RESPONSE_SPLIT } from '$lib/types';
+
+  import type { Writable } from 'svelte/store';
+  import type { i18n as i18nType } from 'i18next';
+
+  const i18n = getContext<Writable<i18nType>>('i18n');
+
+  export let saveHandler: () => void;
 
   // Audio
 
   let TTS_OPENAI_API_BASE_URL = '';
   let TTS_OPENAI_API_KEY = '';
   let TTS_API_KEY = '';
   let TTS_ENGINE = '';
   let TTS_MODEL = '';
   let TTS_VOICE = '';
+  let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION;
 
   let STT_OPENAI_API_BASE_URL = '';
   let STT_OPENAI_API_KEY = '';
   let STT_ENGINE = '';
   let STT_MODEL = '';
 
-  let voices = [];
-  let models = [];
-  let nonLocalVoices = false;
+  // eslint-disable-next-line no-undef
+  let voices: SpeechSynthesisVoice[] = [];
+  let models: Awaited<ReturnType<typeof _getModels>>['models'] = [];
 
   const getModels = async () => {
     if (TTS_ENGINE === '') {
@@ -53,8 +58,8 @@
 
   const getVoices = async () => {
     if (TTS_ENGINE === '') {
-      const getVoicesLoop = setInterval(async () => {
-        voices = await speechSynthesis.getVoices();
+      const getVoicesLoop = setInterval(() => {
+        voices = speechSynthesis.getVoices();
 
         // do your loop
         if (voices.length > 0) {
@@ -81,7 +86,8 @@
         API_KEY: TTS_API_KEY,
         ENGINE: TTS_ENGINE,
         MODEL: TTS_MODEL,
-        VOICE: TTS_VOICE
+        VOICE: TTS_VOICE,
+        SPLIT_ON: TTS_SPLIT_ON
       },
       stt: {
         OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL,
@@ -92,9 +98,8 @@
     });
 
     if (res) {
-      toast.success($i18n.t('Audio settings updated successfully'));
-      config.set(await getBackendConfig());
+      saveHandler();
+      getBackendConfig().then(config.set).catch(() => {});
     }
   };
 
@@ -111,6 +116,8 @@
       TTS_MODEL = res.tts.MODEL;
       TTS_VOICE = res.tts.VOICE;
 
+      TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION;
+
       STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL;
       STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY;
 
@@ -139,7 +146,7 @@
       <div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
       <div class="flex items-center relative">
         <select
-          class="dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+          class="dark:bg-gray-900 cursor-pointer w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
           bind:value={STT_ENGINE}
           placeholder="Select an engine"
         >
@@ -195,7 +202,7 @@
       <div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
       <div class="flex items-center relative">
         <select
-          class=" dark:bg-gray-900 w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+          class=" dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
           bind:value={TTS_ENGINE}
           placeholder="Select a mode"
           on:change={async (e) => {
@@ -203,7 +210,7 @@
             await getVoices();
             await getModels();
 
-            if (e.target.value === 'openai') {
+            if (e.target?.value === 'openai') {
               TTS_VOICE = 'alloy';
               TTS_MODEL = 'tts-1';
             } else {
@@ -351,6 +358,28 @@
           </div>
         </div>
       {/if}
+
+      <hr class="dark:border-gray-850 my-2" />
+
+      <div class="pt-0.5 flex w-full justify-between">
+        <div class="self-center text-xs font-medium">{$i18n.t('Response splitting')}</div>
+        <div class="flex items-center relative">
+          <select
+            class="dark:bg-gray-900 w-fit pr-8 cursor-pointer rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+            placeholder="Select how to split response text"
+            bind:value={TTS_SPLIT_ON}
+          >
+            {#each Object.values(TTS_RESPONSE_SPLIT) as split}
+              <option value={split}>{$i18n.t(split.charAt(0).toUpperCase() + split.slice(1))}</option>
+            {/each}
+          </select>
+        </div>
+      </div>
+      <div class="mt-2 mb-1 text-xs text-gray-400 dark:text-gray-500">
+        {$i18n.t(
+          "Choose how to split response text for speech synthesis. 'Punctuation' splits by sentences, 'paragraphs' splits by paragraphs, and 'none' sends the response as a single string."
+        )}
+      </div>
     </div>
   </div>
 </div>
@@ -2,11 +2,10 @@
   import { toast } from 'svelte-sonner';
   import dayjs from 'dayjs';
 
-  import { fade } from 'svelte/transition';
   import { createEventDispatcher } from 'svelte';
   import { onMount, tick, getContext } from 'svelte';
 
-  const i18n = getContext('i18n');
+  const i18n = getContext<Writable<i18nType>>('i18n');
 
   const dispatch = createEventDispatcher();
 
@@ -15,20 +14,18 @@
   import { imageGenerations } from '$lib/apis/images';
   import {
     approximateToHumanReadable,
-    extractSentences,
-    replaceTokens,
-    processResponseContent
+    extractParagraphsForAudio,
+    extractSentencesForAudio,
+    prepareTextForTTS,
   } from '$lib/utils';
   import { WEBUI_BASE_URL } from '$lib/constants';
 
   import Name from './Name.svelte';
   import ProfileImage from './ProfileImage.svelte';
   import Skeleton from './Skeleton.svelte';
-  import CodeBlock from './CodeBlock.svelte';
   import Image from '$lib/components/common/Image.svelte';
   import Tooltip from '$lib/components/common/Tooltip.svelte';
   import RateComment from './RateComment.svelte';
-  import CitationsModal from '$lib/components/chat/Messages/CitationsModal.svelte';
   import Spinner from '$lib/components/common/Spinner.svelte';
   import WebSearchResults from './ResponseMessage/WebSearchResults.svelte';
   import Sparkles from '$lib/components/icons/Sparkles.svelte';
@@ -36,7 +33,38 @@
   import Error from './Error.svelte';
   import Citations from './Citations.svelte';
 
-  export let message;
+  import type { Writable } from 'svelte/store';
+  import type { i18n as i18nType } from 'i18next';
+  import { TTS_RESPONSE_SPLIT } from '$lib/types';
+
+  interface MessageType {
+    id: string;
+    model: string;
+    content: string;
+    files?: { type: string; url: string }[];
+    timestamp: number;
+    role: string;
+    statusHistory?: { done: boolean; action: string; description: string; urls?: string[]; query?: string; }[];
+    status?: { done: boolean; action: string; description: string; urls?: string[]; query?: string; };
+    done: boolean;
+    error?: boolean | { content: string };
+    citations?: string[];
+    info?: {
+      openai?: boolean;
+      prompt_tokens?: number;
+      completion_tokens?: number;
+      total_tokens?: number;
+      eval_count?: number;
+      eval_duration?: number;
+      prompt_eval_count?: number;
+      prompt_eval_duration?: number;
+      total_duration?: number;
+      load_duration?: number;
+    };
+    annotation?: { type: string; rating: number; };
+  }
+
+  export let message: MessageType;
   export let siblings;
 
   export let isLastMessage = true;
@@ -60,28 +88,33 @@
   let editedContent = '';
   let editTextAreaElement: HTMLTextAreaElement;
 
-  let sentencesAudio = {};
-  let speaking = null;
-  let speakingIdx = null;
+  let audioParts: Record<number, HTMLAudioElement | null> = {};
+  let speaking = false;
+  let speakingIdx: number | undefined;
 
   let loadingSpeech = false;
   let generatingImage = false;
 
   let showRateComment = false;
 
-  const playAudio = (idx) => {
-    return new Promise((res) => {
+  const playAudio = (idx: number) => {
+    return new Promise<void>((res) => {
       speakingIdx = idx;
-      const audio = sentencesAudio[idx];
+      const audio = audioParts[idx];
+
+      if (!audio) {
+        return res();
+      }
 
       audio.play();
-      audio.onended = async (e) => {
+      audio.onended = async () => {
         await new Promise((r) => setTimeout(r, 300));
 
-        if (Object.keys(sentencesAudio).length - 1 === idx) {
-          speaking = null;
+        if (Object.keys(audioParts).length - 1 === idx) {
+          speaking = false;
         }
 
-        res(e);
+        res();
       };
     });
   };
@@ -91,113 +124,119 @@
       try {
         speechSynthesis.cancel();
 
-        sentencesAudio[speakingIdx].pause();
-        sentencesAudio[speakingIdx].currentTime = 0;
+        if (speakingIdx !== undefined && audioParts[speakingIdx]) {
+          audioParts[speakingIdx]!.pause();
+          audioParts[speakingIdx]!.currentTime = 0;
+        }
       } catch {}
 
-      speaking = null;
-      speakingIdx = null;
-    } else {
-      if ((message?.content ?? '').trim() !== '') {
-        speaking = true;
-
-        if ($config.audio.tts.engine !== '') {
-          loadingSpeech = true;
-
-          const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
-            const lastIndex = mergedTexts.length - 1;
-            if (lastIndex >= 0) {
-              const previousText = mergedTexts[lastIndex];
-              const wordCount = previousText.split(/\s+/).length;
-              if (wordCount < 2) {
-                mergedTexts[lastIndex] = previousText + ' ' + currentText;
-              } else {
-                mergedTexts.push(currentText);
-              }
-            } else {
-              mergedTexts.push(currentText);
-            }
-            return mergedTexts;
-          }, []);
-
-          console.log(sentences);
-
-          if (sentences.length > 0) {
-            sentencesAudio = sentences.reduce((a, e, i, arr) => {
-              a[i] = null;
-              return a;
-            }, {});
-
-            let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
-
-            for (const [idx, sentence] of sentences.entries()) {
-              const res = await synthesizeOpenAISpeech(
-                localStorage.token,
-                $settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice
-                  ? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-                  : $config?.audio?.tts?.voice,
-                sentence
-              ).catch((error) => {
-                toast.error(error);
-
-                speaking = null;
-                loadingSpeech = false;
-
-                return null;
-              });
-
-              if (res) {
-                const blob = await res.blob();
-                const blobUrl = URL.createObjectURL(blob);
-                const audio = new Audio(blobUrl);
-                sentencesAudio[idx] = audio;
-                loadingSpeech = false;
-                lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
-              }
-            }
-          } else {
-            speaking = null;
-            loadingSpeech = false;
-          }
-        } else {
-          let voices = [];
-          const getVoicesLoop = setInterval(async () => {
-            voices = await speechSynthesis.getVoices();
-            if (voices.length > 0) {
-              clearInterval(getVoicesLoop);
-
-              const voice =
-                voices
-                  ?.filter(
-                    (v) =>
-                      v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
-                  )
-                  ?.at(0) ?? undefined;
-
-              console.log(voice);
-
-              const speak = new SpeechSynthesisUtterance(message.content);
-
-              console.log(speak);
-
-              speak.onend = () => {
-                speaking = null;
-                if ($settings.conversationMode) {
-                  document.getElementById('voice-input-button')?.click();
-                }
-              };
-
-              if (voice) {
-                speak.voice = voice;
-              }
-
-              speechSynthesis.speak(speak);
-            }
-          }, 100);
-        }
-      } else {
-        toast.error($i18n.t('No content to speak'));
+      speaking = false;
+      speakingIdx = undefined;
+      return;
+    }
+
+    if (!(message?.content ?? '').trim().length) {
+      toast.info($i18n.t('No content to speak'));
+      return;
+    }
+
+    speaking = true;
+
+    if ($config.audio.tts.engine !== '') {
+      loadingSpeech = true;
+
+      const preparedMessageContent: string[] = [];
+
+      switch ($config.audio.tts.split_on) {
+        default:
+        case TTS_RESPONSE_SPLIT.PUNCTUATION:
+          preparedMessageContent.push(...extractSentencesForAudio(message.content));
+          break;
+        case TTS_RESPONSE_SPLIT.PARAGRAPHS:
+          preparedMessageContent.push(...extractParagraphsForAudio(message.content));
+          break;
+        case TTS_RESPONSE_SPLIT.NONE:
+          preparedMessageContent.push(prepareTextForTTS(message.content));
+          break;
       }
+
+      if (!preparedMessageContent.length) {
+        console.log('No content to speak');
+        toast.info($i18n.t('No content to speak'));
+
+        speaking = false;
+        loadingSpeech = false;
+        return;
+      }
+
+      console.debug('Prepared message content for TTS', preparedMessageContent);
+
+      audioParts = preparedMessageContent.reduce((acc, _sentence, idx) => {
+        acc[idx] = null;
+        return acc;
+      }, {} as typeof audioParts);
+
+      let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately
+
+      for (const [idx, sentence] of preparedMessageContent.entries()) {
+        const res = await synthesizeOpenAISpeech(
+          localStorage.token,
+          $settings?.audio?.tts?.defaultVoice === $config.audio.tts.voice
+            ? ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+            : $config?.audio?.tts?.voice,
+          sentence
+        ).catch((error) => {
+          console.error(error);
+          toast.error(error);
+
+          speaking = false;
+          loadingSpeech = false;
+        });
+
+        if (res) {
+          const blob = await res.blob();
+          const blobUrl = URL.createObjectURL(blob);
+          const audio = new Audio(blobUrl);
+          audioParts[idx] = audio;
+          loadingSpeech = false;
+          lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
+        }
+      }
+    } else {
+      let voices = [];
+      const getVoicesLoop = setInterval(() => {
+        voices = speechSynthesis.getVoices();
+        if (voices.length > 0) {
+          clearInterval(getVoicesLoop);
+
+          const voice =
+            voices
+              ?.filter(
+                (v) =>
+                  v.voiceURI === ($settings?.audio?.tts?.voice ?? $config?.audio?.tts?.voice)
+              )
+              ?.at(0) ?? undefined;
+
+          console.log(voice);
+
+          const speak = new SpeechSynthesisUtterance(message.content);
+
+          console.log(speak);
+
+          speak.onend = () => {
+            speaking = false;
+            if ($settings.conversationMode) {
+              document.getElementById('voice-input-button')?.click();
+            }
+          };
+
+          if (voice) {
+            speak.voice = voice;
+          }
+
+          speechSynthesis.speak(speak);
+        }
+      }, 100);
     }
   };
 
@@ -230,7 +269,7 @@
     await tick();
   };
 
-  const generateImage = async (message) => {
+  const generateImage = async (message: MessageType) => {
     generatingImage = true;
     const res = await imageGenerations(localStorage.token, message.content).catch((error) => {
       toast.error(error);
@@ -285,7 +324,7 @@
       </Name>
 
       <div>
-        {#if (message?.files ?? []).filter((f) => f.type === 'image').length > 0}
+        {#if message?.files && message.files?.filter((f) => f.type === 'image').length > 0}
           <div class="my-2.5 w-full flex overflow-x-auto gap-2 flex-wrap">
             {#each message.files as file}
               <div>
@@ -304,7 +343,7 @@
             message?.statusHistory ?? [...(message?.status ? [message?.status] : [])]
           ).at(-1)}
           <div class="flex items-center gap-2 pt-0.5 pb-1">
-            {#if status.done === false}
+            {#if status?.done === false}
               <div class="">
                 <Spinner className="size-4" />
               </div>
@@ -521,7 +560,7 @@
             : 'invisible group-hover:visible'} p-1.5 hover:bg-black/5 dark:hover:bg-white/5 rounded-lg dark:hover:text-white hover:text-black transition"
           on:click={() => {
             if (!loadingSpeech) {
-              toggleSpeakMessage(message);
+              toggleSpeakMessage();
             }
           }}
         >
@@ -661,7 +700,7 @@
           `${
             Math.round(
               ((message.info.eval_count ?? 0) /
-                (message.info.eval_duration / 1000000000)) *
+                ((message.info.eval_duration ?? 0) / 1000000000)) *
                 100
             ) / 100
           } tokens` ?? 'N/A'
@@ -669,7 +708,7 @@
           prompt_token/s: ${
             Math.round(
               ((message.info.prompt_eval_count ?? 0) /
-                (message.info.prompt_eval_duration / 1000000000)) *
+                ((message.info.prompt_eval_duration ?? 0) / 1000000000)) *
                 100
             ) / 100 ?? 'N/A'
           } tokens<br/>
@@ -688,7 +727,7 @@
           eval_duration: ${
             Math.round(((message.info.eval_duration ?? 0) / 1000000) * 100) / 100 ?? 'N/A'
           }ms<br/>
-          approximate_total: ${approximateToHumanReadable(message.info.total_duration)}`}
+          approximate_total: ${approximateToHumanReadable((message.info.total_duration ?? 0))}`}
           placement="top"
         >
           <Tooltip content={$i18n.t('Generation Info')} placement="bottom">
@@ -7,3 +7,9 @@ export type Banner = {
   dismissible?: boolean;
   timestamp: number;
 };
+
+export enum TTS_RESPONSE_SPLIT {
+  PUNCTUATION = 'punctuation',
+  PARAGRAPHS = 'paragraphs',
+  NONE = 'none',
+}
@@ -408,7 +408,7 @@ const convertOpenAIMessages = (convo) => {
   let currentId = '';
   let lastId = null;
 
-  for (let message_id in mapping) {
+  for (const message_id in mapping) {
     const message = mapping[message_id];
     currentId = message_id;
     try {
@@ -442,7 +442,7 @@ const convertOpenAIMessages = (convo) => {
     }
   }
 
-  let history = {};
+  const history: Record<PropertyKey, (typeof messages)[number]> = {};
   messages.forEach((obj) => (history[obj.id] = obj));
 
   const chat = {
@@ -481,7 +481,7 @@ const validateChat = (chat) => {
   }
 
   // Every message's content should be a string
-  for (let message of messages) {
+  for (const message of messages) {
     if (typeof message.content !== 'string') {
       return false;
     }
@@ -494,7 +494,7 @@ export const convertOpenAIChats = (_chats) => {
   // Create a list of dictionaries with each conversation from import
   const chats = [];
   let failed = 0;
-  for (let convo of _chats) {
+  for (const convo of _chats) {
     const chat = convertOpenAIMessages(convo);
 
     if (validateChat(chat)) {
@@ -513,7 +513,7 @@ export const convertOpenAIChats = (_chats) => {
   return chats;
 };
 
-export const isValidHttpUrl = (string) => {
+export const isValidHttpUrl = (string: string) => {
   let url;
 
   try {
@@ -525,7 +525,7 @@ export const isValidHttpUrl = (string) => {
   return url.protocol === 'http:' || url.protocol === 'https:';
 };
 
-export const removeEmojis = (str) => {
+export const removeEmojis = (str: string) => {
   // Regular expression to match emojis
   const emojiRegex = /[\uD800-\uDBFF][\uDC00-\uDFFF]|\uD83C[\uDC00-\uDFFF]|\uD83D[\uDC00-\uDE4F]/g;
 
@@ -533,20 +533,24 @@ export const removeEmojis = (str) => {
   return str.replace(emojiRegex, '');
 };
 
-export const removeFormattings = (str) => {
+export const removeFormattings = (str: string) => {
   return str.replace(/(\*)(.*?)\1/g, '').replace(/(```)(.*?)\1/gs, '');
 };
 
-export const extractSentences = (text) => {
-  // This regular expression matches code blocks marked by triple backticks
-  const codeBlockRegex = /```[\s\S]*?```/g;
+export const prepareTextForTTS = (content: string) => {
+  return removeFormattings(removeEmojis(content.trim()));
+};
 
-  let codeBlocks = [];
+// This regular expression matches code blocks marked by triple backticks
+const codeBlockRegex = /```[\s\S]*?```/g;
+
+export const extractSentences = (text: string) => {
+  const codeBlocks: string[] = [];
   let index = 0;
 
   // Temporarily replace code blocks with placeholders and store the blocks separately
   text = text.replace(codeBlockRegex, (match) => {
-    let placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+    const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
     codeBlocks[index++] = match;
     return placeholder;
   });
@@ -561,11 +565,36 @@ export const extractSentences = (text) => {
   });
 
   return sentences
-    .map((sentence) => removeFormattings(removeEmojis(sentence.trim())))
-    .filter((sentence) => sentence);
+    .map(prepareTextForTTS)
+    .filter(Boolean);
 };
 
-export const extractSentencesForAudio = (text) => {
+export const extractParagraphsForAudio = (text: string) => {
+  const codeBlocks: string[] = [];
+  let index = 0;
+
+  // Temporarily replace code blocks with placeholders and store the blocks separately
+  text = text.replace(codeBlockRegex, (match) => {
+    const placeholder = `\u0000${index}\u0000`; // Use a unique placeholder
+    codeBlocks[index++] = match;
+    return placeholder;
+  });
+
+  // Split the modified text into paragraphs based on newlines, avoiding these blocks
+  let paragraphs = text.split(/\n+/);
+
+  // Restore code blocks and process paragraphs
+  paragraphs = paragraphs.map((paragraph) => {
+    // Check if the paragraph includes a placeholder for a code block
+    return paragraph.replace(/\u0000(\d+)\u0000/g, (_, idx) => codeBlocks[idx]);
+  });
+
+  return paragraphs
+    .map(prepareTextForTTS)
+    .filter(Boolean);
+};
+
+export const extractSentencesForAudio = (text: string) => {
   return extractSentences(text).reduce((mergedTexts, currentText) => {
     const lastIndex = mergedTexts.length - 1;
     if (lastIndex >= 0) {
@@ -580,7 +609,7 @@ export const extractSentencesForAudio = (text) => {
       mergedTexts.push(currentText);
     }
     return mergedTexts;
-  }, []);
+  }, [] as string[]);
 };
 
 export const blobToFile = (blob, fileName) => {
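For reference, a hedged usage sketch of the text helpers touched above; the sample reply and the exact chunk boundaries are illustrative, since very short sentences may be merged by the helper internals:

```typescript
import {
  extractParagraphsForAudio,
  extractSentencesForAudio,
  prepareTextForTTS
} from '$lib/utils';

const reply = 'Sure! Here is the plan.\n\nStep one: install it. Step two: run it.';

// 'punctuation' mode: sentence-sized chunks, with short sentences merged into a neighbour.
extractSentencesForAudio(reply);
// e.g. ['Sure! Here is the plan.', 'Step one: install it.', 'Step two: run it.']

// 'paragraphs' mode: split on newlines, code blocks preserved as single chunks.
extractParagraphsForAudio(reply);
// e.g. ['Sure! Here is the plan.', 'Step one: install it. Step two: run it.']

// 'none' mode: one cleaned string (trimmed, emojis and markdown formatting removed).
prepareTextForTTS(reply);
```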