feat: chat supports voice input (#276)

* feat: chat supports voice input

* refactor: hide window out of focus

* feat: search supports voice input
Author: ayangweb
Date: 2025-03-11 16:36:51 +08:00
Committed by: GitHub
Parent: 44a3ea3868
Commit: 0044e9a536

5 changed files with 104 additions and 19 deletions

package.json

@@ -57,6 +57,7 @@
   },
   "devDependencies": {
     "@tauri-apps/cli": "^2.3.1",
+    "@types/dom-speech-recognition": "^0.0.4",
     "@types/lodash-es": "^4.17.12",
     "@types/markdown-it": "^14.1.2",
     "@types/node": "^22.13.10",

pnpm-lock.yaml (generated; 8 changed lines)

@@ -135,6 +135,9 @@ importers:
       '@tauri-apps/cli':
         specifier: ^2.3.1
         version: 2.3.1
+      '@types/dom-speech-recognition':
+        specifier: ^0.0.4
+        version: 0.0.4
       '@types/lodash-es':
         specifier: ^4.17.12
         version: 4.17.12
@@ -1230,6 +1233,9 @@ packages:
   '@types/debug@4.1.12':
     resolution: {integrity: sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==}

+  '@types/dom-speech-recognition@0.0.4':
+    resolution: {integrity: sha512-zf2GwV/G6TdaLwpLDcGTIkHnXf8JEf/viMux+khqKQKDa8/8BAUtXXZS563GnvJ4Fg0PBLGAaFf2GekEVSZ6GQ==}
+
   '@types/estree-jsx@1.0.5':
     resolution: {integrity: sha512-52CcUVNFyfb1A2ALocQw/Dd1BQFNmSdkuC3BkZ6iqhdMfQz7JWOFRuJFloOzjk+6WijU56m9oKXFAXc7o3Towg==}
@@ -4320,6 +4326,8 @@ snapshots:
     dependencies:
       '@types/ms': 2.1.0

+  '@types/dom-speech-recognition@0.0.4': {}
+
   '@types/estree-jsx@1.0.5':
     dependencies:
       '@types/estree': 1.0.6

Info.plist

@@ -31,5 +31,12 @@
       </array>
     </dict>
   </array>
+  <key>NSMicrophoneUsageDescription</key>
+  <string>Coco AI needs access to your microphone for voice input and audio recording features.</string>
+  <key>NSCameraUsageDescription</key>
+  <string>Coco AI requires camera access for scanning documents and capturing images.</string>
+  <key>NSSpeechRecognitionUsageDescription</key>
+  <string>Coco AI uses speech recognition to convert your voice into text for a hands-free experience.</string>
 </dict>
</plist>

ChatInput.tsx

@@ -1,4 +1,4 @@
-import { ArrowBigLeft, Search, Send, Brain } from "lucide-react";
+import { ArrowBigLeft, Search, Send, Brain, AudioLines } from "lucide-react";
 import { useCallback, useEffect, useRef, useState } from "react";
 import { listen } from "@tauri-apps/api/event";
 import { invoke, isTauri } from "@tauri-apps/api/core";
@@ -13,6 +13,7 @@ import { useAppStore } from "@/stores/appStore";
 import { useSearchStore } from "@/stores/searchStore";
 import { metaOrCtrlKey } from "@/utils/keyboardUtils";
 import SearchPopover from "./SearchPopover";
+import SpeechToText from "../SpeechToText";

 interface ChatInputProps {
   onSend: (message: string) => void;
@@ -281,23 +282,13 @@
         ) : null}
       </div>
-      {/* {isChatMode ? (
-        <button
-          className={`p-1 hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition-colors ${
-            isListening ? "bg-blue-100 dark:bg-blue-900" : ""
-          }`}
-          type="button"
-          onClick={() => {}}
-        >
-          <Mic
-            className={`w-4 h-4 ${
-              isListening
-                ? "text-blue-500 animate-pulse"
-                : "text-[#999] dark:text-[#999]"
-            }`}
-          />
-        </button>
-      ) : null} */}
+      {isChatMode && (
+        <SpeechToText
+          onChange={(transcript) => {
+            changeInput(inputValue + transcript);
+          }}
+        />
+      )}
       {isChatMode && curChatEnd ? (
         <button
@@ -396,7 +387,14 @@
             />
           </div>
         ) : (
-          <div className="w-28 flex gap-2 relative"></div>
+          <div data-tauri-drag-region className="w-28 flex gap-2 relative">
+            <SpeechToText
+              Icon={AudioLines}
+              onChange={(transcript) => {
+                changeInput(inputValue + transcript);
+              }}
+            />
+          </div>
         )}
         {isChatPage ? null : (

SpeechToText.tsx (new file)

@@ -0,0 +1,71 @@
import { useReactive } from "ahooks";
import clsx from "clsx";
import { Mic } from "lucide-react";
import { ComponentType, FC } from "react";

interface SpeechToTextProps {
  Icon?: ComponentType<any>;
  onChange?: (transcript: string) => void;
}

interface State {
  speaking: boolean;
  transcript: string;
}

let recognition: SpeechRecognition;

const SpeechToText: FC<SpeechToTextProps> = (props) => {
  const { Icon = Mic, onChange } = props;

  const state = useReactive<State>({
    speaking: false,
    transcript: "",
  });

  const handleSpeak = () => {
    // Toggle off: stop the in-progress recognition session.
    if (state.speaking) {
      state.speaking = false;
      return recognition.stop();
    }

    // The constructor is vendor-prefixed in some engines (e.g. WebKit).
    const SpeechRecognition =
      window.SpeechRecognition || window.webkitSpeechRecognition;

    // Quietly do nothing where the Web Speech API is unavailable.
    if (!SpeechRecognition) return;

    recognition = new SpeechRecognition();

    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.lang = "zh-CN";

    recognition.onresult = (event) => {
      // Join every result so far: in continuous mode each pause starts
      // a new result, so reading only results[0] would drop later speech.
      state.transcript = Array.from(event.results)
        .map((result) => result[0].transcript)
        .join("");

      onChange?.(state.transcript);
    };

    // Keep the indicator in sync when the session ends on its own
    // (silence timeout, permission revoked, etc.).
    recognition.onend = () => {
      state.speaking = false;
    };

    recognition.start();

    state.speaking = true;
  };

  return (
    <div
      className={clsx(
        "p-1 hover:bg-gray-50 dark:hover:bg-gray-700 rounded-full transition",
        {
          "bg-blue-100 dark:bg-blue-900": state.speaking,
        }
      )}
    >
      <Icon
        className={clsx("size-4 text-[#999] dark:text-[#999]", {
          "text-blue-500 animate-pulse": state.speaking,
        })}
        onClick={handleSpeak}
      />
    </div>
  );
};

export default SpeechToText;
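
One caveat: the Web Speech API is not present in every WebView, which is why the component above returns early when the constructor is missing. A consumer could also feature-detect and hide the control entirely; a minimal sketch, where isSpeechSupported is an illustrative helper and not part of this commit:

// Illustrative helper: render the mic control only where the
// Web Speech API actually exists on `window`.
export const isSpeechSupported = () =>
  typeof window !== "undefined" &&
  Boolean(window.SpeechRecognition || window.webkitSpeechRecognition);

// Usage at a call site such as ChatInput:
//   {isSpeechSupported() && <SpeechToText onChange={handleTranscript} />}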