feat: voice input support for search and chat (#302)

* feat: voice input support for search and chat

* chore: add mic-recorder plugin

* refactor: check microphone permission before recording

* feat: implement sound wave effects

* chore: remove mic-recorder plugin
This commit is contained in:
ayangweb
2025-03-24 09:17:09 +08:00
committed by GitHub
parent ef1304ce5e
commit 118eaa55e3
9 changed files with 873 additions and 834 deletions

View File

@@ -19,7 +19,7 @@
},
"dependencies": {
"@headlessui/react": "^2.2.0",
"@tauri-apps/api": "^2.3.0",
"@tauri-apps/api": "^2.4.0",
"@tauri-apps/plugin-autostart": "~2.2.0",
"@tauri-apps/plugin-deep-link": "^2.2.0",
"@tauri-apps/plugin-dialog": "^2.2.0",
@@ -28,9 +28,10 @@
"@tauri-apps/plugin-os": "^2.2.1",
"@tauri-apps/plugin-process": "^2.2.0",
"@tauri-apps/plugin-shell": "^2.2.0",
"@tauri-apps/plugin-updater": "^2.6.0",
"@tauri-apps/plugin-updater": "^2.6.1",
"@tauri-apps/plugin-websocket": "~2.3.0",
"@tauri-apps/plugin-window": "2.0.0-alpha.1",
"@wavesurfer/react": "^1.0.9",
"ahooks": "^3.8.4",
"clsx": "^2.1.1",
"dotenv": "^16.4.7",
@@ -39,8 +40,8 @@
"i18next-browser-languagedetector": "^8.0.4",
"lodash-es": "^4.17.21",
"lucide-react": "^0.461.0",
"mermaid": "^11.4.1",
"nanoid": "^5.1.3",
"mermaid": "^11.5.0",
"nanoid": "^5.1.5",
"react": "^18.3.1",
"react-dom": "^18.3.1",
"react-hotkeys-hook": "^4.6.1",
@@ -54,19 +55,20 @@
"remark-gfm": "^4.0.1",
"remark-math": "^6.0.0",
"tauri-plugin-fs-pro-api": "^2.3.1",
"tauri-plugin-macos-permissions-api": "^2.1.1",
"tauri-plugin-macos-permissions-api": "^2.2.0",
"tauri-plugin-screenshots-api": "^2.1.0",
"use-debounce": "^10.0.4",
"uuid": "^11.1.0",
"wavesurfer.js": "^7.9.3",
"zustand": "^5.0.3"
},
"devDependencies": {
"@tauri-apps/cli": "^2.3.1",
"@tauri-apps/cli": "^2.4.0",
"@types/dom-speech-recognition": "^0.0.4",
"@types/lodash-es": "^4.17.12",
"@types/markdown-it": "^14.1.2",
"@types/node": "^22.13.10",
"@types/react": "^18.3.18",
"@types/node": "^22.13.11",
"@types/react": "^18.3.19",
"@types/react-dom": "^18.3.5",
"@types/react-katex": "^3.0.4",
"@types/react-window": "^1.8.8",

830
pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

487
src-tauri/Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,180 @@
import { useAppStore } from "@/stores/appStore";
import { useReactive } from "ahooks";
import clsx from "clsx";
import { Check, Loader, Mic, X } from "lucide-react";
import { FC, useEffect, useRef } from "react";
import {
  checkMicrophonePermission,
  requestMicrophonePermission,
} from "tauri-plugin-macos-permissions-api";
import { useWavesurfer } from "@wavesurfer/react";
import RecordPlugin from "wavesurfer.js/dist/plugins/record.esm.js";

interface AudioRecordingProps {
  /** Invoked with the recognized text once a confirmed recording finishes converting. */
  onChange?: (text: string) => void;
}

interface State {
  /** True while the microphone is actively capturing audio. */
  isRecording: boolean;
  /** True while the captured audio is being converted to text. */
  converting: boolean;
  /** Seconds remaining before the recording is auto-submitted. */
  countdown: number;
}

const INITIAL_STATE: State = {
  isRecording: false,
  converting: false,
  countdown: 30,
};

/**
 * Microphone recording widget: a mic button that expands into a panel with a
 * live scrolling waveform, a 30-second countdown, and cancel/confirm buttons.
 * On confirm the audio is converted to text (conversion currently stubbed with
 * a 3s timeout) and reported through `onChange`.
 */
const AudioRecording: FC<AudioRecordingProps> = (props) => {
  const { onChange } = props;

  const state = useReactive({ ...INITIAL_STATE });
  const containerRef = useRef<HTMLDivElement>(null);
  const recordRef = useRef<RecordPlugin>();
  // Countdown timer handle lives in a ref (not at module scope) so multiple
  // mounted instances cannot clobber each other's timer.
  const intervalRef = useRef<ReturnType<typeof setInterval>>();
  const withVisibility = useAppStore((state) => state.withVisibility);

  const { wavesurfer } = useWavesurfer({
    container: containerRef,
    height: 20,
    waveColor: "#0072ff",
    progressColor: "#999",
    barWidth: 4,
    barRadius: 4,
    barGap: 2,
  });

  // Attach the wavesurfer record plugin once the instance exists.
  useEffect(() => {
    if (!wavesurfer) return;

    const record = wavesurfer.registerPlugin(
      RecordPlugin.create({
        scrollingWaveform: true,
        renderRecordedAudio: false,
      })
    );

    record.on("record-end", (blob) => {
      const recordedUrl = URL.createObjectURL(blob);
      console.log("recorded:", recordedUrl);
      // setAudioUrl(recordedUrl);
    });

    recordRef.current = record;

    // Stop any in-flight recording and reset UI state on unmount.
    return resetState;
  }, [wavesurfer]);

  // Tick the countdown once per second while recording; auto-submit at zero.
  useEffect(() => {
    if (!state.isRecording) return;

    intervalRef.current = setInterval(() => {
      if (state.countdown <= 0) {
        handleOk();
        // Don't decrement past zero after auto-submitting.
        return;
      }

      state.countdown--;
    }, 1000);

    // Clear the timer when recording stops or the component unmounts.
    return () => clearInterval(intervalRef.current);
  }, [state.isRecording]);

  // Stop recording, cancel the countdown, and restore the initial state,
  // optionally overriding individual fields (used by handleOk).
  const resetState = (otherState: Partial<State> = {}) => {
    clearInterval(intervalRef.current);

    recordRef.current?.stopRecording();

    Object.assign(state, { ...INITIAL_STATE, ...otherState });
  };

  // Ensure microphone permission is granted (macOS), polling every 500ms until
  // the user accepts the system prompt.
  // NOTE(review): the poll timer is never cleared if the user denies the
  // prompt or the component unmounts first — consider a cancellation path.
  const checkPermission = async () => {
    const authorized = await checkMicrophonePermission();

    if (authorized) return;

    requestMicrophonePermission();

    return new Promise((resolve) => {
      const timer = setInterval(async () => {
        const authorized = await checkMicrophonePermission();

        if (!authorized) return;

        clearInterval(timer);

        resolve(true);
      }, 500);
    });
  };

  const startRecording = async () => {
    // Keep the window visible while the system permission dialog may be open.
    await withVisibility(checkPermission);

    state.isRecording = true;

    recordRef.current?.startRecording();
  };

  // Confirm: stop capturing, keep the countdown display, and show the
  // converting spinner until speech-to-text completes (stubbed for now).
  const handleOk = () => {
    resetState({ converting: true, countdown: state.countdown });

    setTimeout(() => {
      onChange?.("");

      resetState();
    }, 3000);
  };

  return (
    <>
      <div
        className={clsx(
          "p-1 hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition cursor-pointer"
        )}
      >
        <Mic className="size-4 text-[#999]" onClick={startRecording} />
      </div>

      <div
        className={clsx(
          "absolute inset-0 flex items-center gap-1 px-1 rounded translate-x-full transition-all bg-[#ededed] dark:bg-[#202126]",
          {
            "!translate-x-0": state.isRecording || state.converting,
          }
        )}
      >
        <button
          disabled={state.converting}
          className={clsx(
            "flex items-center justify-center size-6 bg-white dark:bg-black rounded-full transition cursor-pointer",
            {
              "!cursor-not-allowed opacity-50": state.converting,
            }
          )}
          onClick={() => resetState()}
        >
          <X className="size-4 text-[#0C0C0C] dark:text-[#999999]" />
        </button>

        <div className="flex items-center gap-1 flex-1 h-6 px-2 bg-white dark:bg-black rounded-full transition">
          <div ref={containerRef} className="flex-1"></div>

          <span className="text-xs text-[#333] dark:text-[#999]">
            {state.countdown}
          </span>
        </div>

        <button
          disabled={state.converting}
          className="flex items-center justify-center size-6 text-white bg-[#0072FF] rounded-full transition cursor-pointer"
          onClick={handleOk}
        >
          {state.converting ? (
            <Loader className="size-4 animate-spin" />
          ) : (
            <Check className="size-4" />
          )}
        </button>
      </div>
    </>
  );
};

export default AudioRecording;

View File

@@ -11,8 +11,8 @@ import { useAppStore } from "@/stores/appStore";
import { useSearchStore } from "@/stores/searchStore";
import { metaOrCtrlKey } from "@/utils/keyboardUtils";
import SearchPopover from "./SearchPopover";
// import SpeechToText from "../SpeechToText";
import { DataSource } from "@/components/Assistant/types";
import AudioRecording from "../AudioRecording";
interface ChatInputProps {
onSend: (message: string) => void;
@@ -61,16 +61,16 @@ export default function ChatInput({
getDataSourcesByServer,
setupWindowFocusListener,
hideCoco,
// checkScreenPermission,
// requestScreenPermission,
// getScreenMonitors,
// getScreenWindows,
// captureMonitorScreenshot,
// captureWindowScreenshot,
// openFileDialog,
// getFileMetadata,
// getFileIcon,
}: ChatInputProps) {
}: // checkScreenPermission,
// requestScreenPermission,
// getScreenMonitors,
// getScreenWindows,
// captureMonitorScreenshot,
// captureWindowScreenshot,
// openFileDialog,
// getFileMetadata,
// getFileIcon,
ChatInputProps) {
const { t } = useTranslation();
const showTooltip = useAppStore(
@@ -229,11 +229,9 @@ export default function ChatInput({
};
return (
<div
className={`w-full relative`}
>
<div className={`w-full relative`}>
<div
className={`p-2 flex items-center dark:text-[#D8D8D8] bg-[#ededed] dark:bg-[#202126] rounded transition-all relative `}
className={`p-2 flex items-center dark:text-[#D8D8D8] bg-[#ededed] dark:bg-[#202126] rounded transition-all relative overflow-hidden`}
>
<div className="flex flex-wrap gap-2 flex-1 items-center relative">
{!isChatMode && !sourceData ? (
@@ -294,13 +292,12 @@ export default function ChatInput({
) : null}
</div>
{/* {isChatMode && (
<SpeechToText
onChange={(transcript) => {
changeInput(inputValue + transcript);
}}
/>
)} */}
<AudioRecording
key={isChatMode ? "chat" : "search"}
onChange={(text) => {
changeInput(inputValue + text);
}}
/>
{isChatMode && curChatEnd ? (
<button
@@ -410,14 +407,10 @@ export default function ChatInput({
/>
</div>
) : (
<div data-tauri-drag-region className="w-28 flex gap-2 relative">
{/* <SpeechToText
Icon={AudioLines}
onChange={(transcript) => {
changeInput(inputValue + transcript);
}}
/> */}
</div>
<div
data-tauri-drag-region
className="w-28 flex gap-2 relative"
></div>
)}
{isChatPage ? null : (

View File

@@ -40,7 +40,9 @@ interface InputExtraProps {
getScreenWindows: () => Promise<any[]>;
captureMonitorScreenshot: (id: number) => Promise<string>;
captureWindowScreenshot: (id: number) => Promise<string>;
openFileDialog: (options: { multiple: boolean }) => Promise<string | string[] | null>;
openFileDialog: (options: {
multiple: boolean;
}) => Promise<string | string[] | null>;
getFileMetadata: (path: string) => Promise<any>;
getFileIcon: (path: string, size: number) => Promise<string>;
}
@@ -59,7 +61,7 @@ const InputExtra = ({
const { t, i18n } = useTranslation();
const uploadFiles = useChatStore((state) => state.uploadFiles);
const setUploadFiles = useChatStore((state) => state.setUploadFiles);
const setIsPinned = useAppStore((state) => state.setIsPinned);
const withVisibility = useAppStore((state) => state.withVisibility);
const state = useReactive<State>({
screenshotableMonitors: [],
@@ -98,14 +100,12 @@ const InputExtra = ({
{
label: t("search.input.uploadFile"),
clickEvent: async () => {
setIsPinned(true);
const selectedFiles = await openFileDialog({
multiple: true,
const selectedFiles = await withVisibility(() => {
return openFileDialog({
multiple: true,
});
});
setIsPinned(false);
if (isNil(selectedFiles)) return;
handleUploadFiles(selectedFiles);

View File

@@ -1,97 +0,0 @@
import { useEventListener, useReactive } from "ahooks";
import clsx from "clsx";
import { LucideIcon, Mic } from "lucide-react";
import { FC, useEffect } from "react";

interface SpeechToTextProps {
  // Icon rendered as the toggle button; defaults to the microphone icon.
  Icon?: LucideIcon;
  // Receives the full running transcript (all results joined) on every update.
  onChange?: (transcript: string) => void;
}

// Single module-level recognition instance: only one dictation session can be
// active at a time across all mounted SpeechToText components.
let recognition: SpeechRecognition | null = null;

/**
 * Toggle button that dictates into the input via the Web Speech API
 * (SpeechRecognition / webkitSpeechRecognition), reporting the running
 * transcript through `onChange`.
 */
const SpeechToText: FC<SpeechToTextProps> = (props) => {
  const { Icon = Mic, onChange } = props;

  const state = useReactive({
    speaking: false,
  });

  // Tear down any active recognition session on unmount.
  useEffect(() => {
    return destroyRecognition;
  }, []);

  // While dictating, blur any input/textarea that gains focus so typing does
  // not interleave with the speech transcript being written into the input.
  useEventListener("focusin", (event) => {
    const { target } = event;

    const isInputElement =
      target instanceof HTMLInputElement ||
      target instanceof HTMLTextAreaElement;

    if (state.speaking && isInputElement) {
      target.blur();
    }
  });

  // Toggle dictation: stop if already speaking, otherwise start a new
  // continuous session with interim results enabled.
  const handleSpeak = () => {
    if (state.speaking) {
      return destroyRecognition();
    }

    const SpeechRecognition =
      window.SpeechRecognition || window.webkitSpeechRecognition;

    recognition = new SpeechRecognition();

    recognition.continuous = true;
    recognition.interimResults = true;
    // NOTE(review): recognition language is hard-coded to zh-CN — confirm
    // whether it should follow the app locale instead.
    recognition.lang = "zh-CN";

    recognition.onresult = (event) => {
      // Join every (possibly interim) result into one running transcript.
      const transcript = [...event.results]
        .map((result) => result[0].transcript)
        .join("");

      onChange?.(transcript);
    };

    // Any recognition error or natural end resets the toggle state.
    recognition.onerror = destroyRecognition;
    recognition.onend = destroyRecognition;

    recognition.start();

    state.speaking = true;
  };

  // Abort the session, detach handlers (so abort doesn't re-enter via
  // onerror/onend), and clear the speaking flag.
  const destroyRecognition = () => {
    if (recognition) {
      recognition.abort();

      recognition.onresult = null;
      recognition.onerror = null;
      recognition.onend = null;

      recognition = null;
    }

    state.speaking = false;
  };

  return (
    <div
      className={clsx(
        "p-1 hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition cursor-pointer",
        {
          "bg-blue-100 dark:bg-blue-900": state.speaking,
        }
      )}
    >
      <Icon
        className={clsx("size-4 text-[#999] dark:text-[#999]", {
          "text-blue-500 animate-pulse": state.speaking,
        })}
        onClick={handleSpeak}
      />
    </div>
  );
};

export default SpeechToText;

View File

@@ -1,15 +1,16 @@
import { useEffect } from 'react';
import { useEffect } from "react";
import { useAppStore } from '@/stores/appStore';
import platformAdapter from '@/utils/platformAdapter';
import { useAppStore } from "@/stores/appStore";
import platformAdapter from "@/utils/platformAdapter";
export function useWindowEvents() {
const isPinned = useAppStore((state) => state.isPinned);
const visible = useAppStore((state) => state.visible);
useEffect(() => {
const handleBlur = async () => {
console.log("Window blurred");
if (isPinned) {
if (isPinned || visible) {
return;
}
@@ -23,5 +24,5 @@ export function useWindowEvents() {
return () => {
window.removeEventListener("blur", handleBlur);
};
}, [isPinned]);
}
}, [isPinned, visible]);
}

View File

@@ -51,6 +51,9 @@ export type IAppStore = {
showCocoShortcuts: string[];
setShowCocoShortcuts: (showCocoShortcuts: string[]) => void;
visible: boolean;
withVisibility: <T>(fn: () => Promise<T>) => Promise<T>;
};
export const useAppStore = create<IAppStore>()(
@@ -104,6 +107,16 @@ export const useAppStore = create<IAppStore>()(
return set({ showCocoShortcuts });
},
visible: false,
withVisibility: async <T>(fn: () => Promise<T>) => {
set({ visible: true });
const result = await fn();
set({ visible: false });
return result;
},
}),
{
name: "app-store",