feat: chat supports voice input (#276)

* feat: chat supports voice input

* refactor: hide window out of focus

* feat: search supports voice input
Author: ayangweb
Date: 2025-03-11 16:36:51 +08:00
Committed by: GitHub
Parent: 44a3ea3868
Commit: 0044e9a536

5 changed files with 104 additions and 19 deletions

package.json

@@ -57,6 +57,7 @@
   },
   "devDependencies": {
     "@tauri-apps/cli": "^2.3.1",
+    "@types/dom-speech-recognition": "^0.0.4",
     "@types/lodash-es": "^4.17.12",
     "@types/markdown-it": "^14.1.2",
     "@types/node": "^22.13.10",

pnpm-lock.yaml (generated; 8 changed lines)

@@ -135,6 +135,9 @@ importers:
       '@tauri-apps/cli':
         specifier: ^2.3.1
         version: 2.3.1
+      '@types/dom-speech-recognition':
+        specifier: ^0.0.4
+        version: 0.0.4
       '@types/lodash-es':
         specifier: ^4.17.12
         version: 4.17.12
@@ -1230,6 +1233,9 @@ packages:
   '@types/debug@4.1.12':
     resolution: {integrity: sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==}

+  '@types/dom-speech-recognition@0.0.4':
+    resolution: {integrity: sha512-zf2GwV/G6TdaLwpLDcGTIkHnXf8JEf/viMux+khqKQKDa8/8BAUtXXZS563GnvJ4Fg0PBLGAaFf2GekEVSZ6GQ==}
+
   '@types/estree-jsx@1.0.5':
     resolution: {integrity: sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==}
@@ -4320,6 +4326,8 @@ snapshots:
     dependencies:
       '@types/ms': 2.1.0

+  '@types/dom-speech-recognition@0.0.4': {}
+
   '@types/estree-jsx@1.0.5':
     dependencies:
       '@types/estree': 1.0.6

Info.plist

@@ -31,5 +31,12 @@
       </array>
     </dict>
   </array>
+  <key>NSMicrophoneUsageDescription</key>
+  <string>Coco AI needs access to your microphone for voice input and audio recording features.</string>
+  <key>NSCameraUsageDescription</key>
+  <string>Coco AI requires camera access for scanning documents and capturing images.</string>
+  <key>NSSpeechRecognitionUsageDescription</key>
+  <string>Coco AI uses speech recognition to convert your voice into text for a hands-free experience.</string>
 </dict>
</plist>

ChatInput.tsx

@@ -1,4 +1,4 @@
-import { ArrowBigLeft, Search, Send, Brain } from "lucide-react";
+import { ArrowBigLeft, Search, Send, Brain, AudioLines } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { listen } from "@tauri-apps/api/event";
 import { invoke, isTauri } from "@tauri-apps/api/core";
@@ -13,6 +13,7 @@ import { useAppStore } from "@/stores/appStore";
 import { useSearchStore } from "@/stores/searchStore";
 import { metaOrCtrlKey } from "@/utils/keyboardUtils";
 import SearchPopover from "./SearchPopover";
+import SpeechToText from "../SpeechToText";

 interface ChatInputProps {
   onSend: (message: string) => void;
@@ -281,23 +282,13 @@
         ) : null}
       </div>
-      {/* {isChatMode ? (
-        <button
-          className={`p-1 hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition-colors ${
-            isListening ? "bg-blue-100 dark:bg-blue-900" : ""
-          }`}
-          type="button"
-          onClick={() => {}}
-        >
-          <Mic
-            className={`w-4 h-4 ${
-              isListening
-                ? "text-blue-500 animate-pulse"
-                : "text-[#999] dark:text-[#999]"
-            }`}
-          />
-        </button>
-      ) : null} */}
+      {isChatMode && (
+        <SpeechToText
+          onChange={(transcript) => {
+            changeInput(inputValue + transcript);
+          }}
+        />
+      )}
       {isChatMode && curChatEnd ? (
         <button
@@ -396,7 +387,14 @@
             />
           </div>
         ) : (
-          <div className="w-28 flex gap-2 relative"></div>
+          <div data-tauri-drag-region className="w-28 flex gap-2 relative">
+            <SpeechToText
+              Icon={AudioLines}
+              onChange={(transcript) => {
+                changeInput(inputValue + transcript);
+              }}
+            />
+          </div>
         )}
         {isChatPage ? null : (

SpeechToText.tsx (new file)

@@ -0,0 +1,71 @@
import { useReactive } from "ahooks";
import clsx from "clsx";
import { Mic } from "lucide-react";
import { ComponentType, FC } from "react";

interface SpeechToTextProps {
  Icon?: ComponentType<any>;
  onChange?: (transcript: string) => void;
}

interface State {
  speaking: boolean;
  transcript: string;
}

let recognition: SpeechRecognition;

const SpeechToText: FC<SpeechToTextProps> = (props) => {
  const { Icon = Mic, onChange } = props;

  const state = useReactive<State>({
    speaking: false,
    transcript: "",
  });

  const handleSpeak = () => {
    // Toggle off: stop the in-progress recognition session.
    if (state.speaking) {
      state.speaking = false;
      return recognition.stop();
    }

    // The constructor is vendor-prefixed in some engines (e.g. WebKit).
    const SpeechRecognition =
      window.SpeechRecognition || window.webkitSpeechRecognition;

    // Quietly do nothing where the Web Speech API is unavailable.
    if (!SpeechRecognition) return;

    recognition = new SpeechRecognition();

    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.lang = "zh-CN";

    recognition.onresult = (event) => {
      // Join every result so far: in continuous mode each pause starts
      // a new result, so reading only results[0] would drop later speech.
      state.transcript = Array.from(event.results)
        .map((result) => result[0].transcript)
        .join("");

      onChange?.(state.transcript);
    };

    // Keep the indicator in sync when the session ends on its own
    // (silence timeout, permission revoked, etc.).
    recognition.onend = () => {
      state.speaking = false;
    };

    recognition.start();

    state.speaking = true;
  };

  return (
    <div
      className={clsx(
        "p-1 hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition",
        {
          "bg-blue-100 dark:bg-blue-900": state.speaking,
        }
      )}
    >
      <Icon
        className={clsx("size-4 text-[#999] dark:text-[#999]", {
          "text-blue-500 animate-pulse": state.speaking,
        })}
        onClick={handleSpeak}
      />
    </div>
  );
};

export default SpeechToText;
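
One caveat: the Web Speech API is not present in every WebView, which is why the component above returns early when the constructor is missing. A consumer could also feature-detect and hide the control entirely; a minimal sketch, where isSpeechSupported is an illustrative helper and not part of this commit:

// Illustrative helper: render the mic control only where the
// Web Speech API actually exists on `window`.
export const isSpeechSupported = () =>
  typeof window !== "undefined" &&
  Boolean(window.SpeechRecognition || window.webkitSpeechRecognition);

// Usage at a call site such as ChatInput:
//   {isSpeechSupported() && <SpeechToText onChange={handleTranscript} />}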