Merge branch 'main' of github.com:Rongjiehuang/AudioGPT

2025-12-16 03:47:55 +01:00 · 2023-04-09 17:02:47 +08:00
parent f3cf2be08c 0dff745f30
commit 028bf0c510
7 changed files with 30 additions and 1 deletions
--- a/assets/README.md
+++ b/assets/README.md
@@ -71,3 +71,32 @@ Output:<br />
 Input Example : Generate an image of a horse<br />
 Output:<br />
 ![](t2i.png)<br />
+
+## Sound Detection
+First upload your audio(.wav)<br />
+Audio Example :<br />
+<audio src="mix.wav" controls></audio><br />
+Input Example : What events does this audio include?<br />
+Output:<br />
+![](detection.png)<br />
+
+## Mono Audio to Binaural Audio
+First upload your audio(.wav)<br />
+<audio src="mix.wav" controls></audio><br />
+Input Example: Transfer the mono speech to a binaural one.<br />
+Output:<br />
+![](m2b.png)<br />
+
+## Target Sound Detection
+First upload your audio(.wav)<br />
+<audio src="mix.wav" controls></audio><br />
+Input Example: Please help me detect the target sound in the audio based on the description: “I want to detect Applause event”<br />
+Output:<br />
+![](tsd.png)<br />
+
+## Sound Extraction
+First upload your audio(.wav)<br />
+<audio src="mix.wav" controls></audio><br />
+Input Example: Please help me extract the sound events from the audio based on the description: "a person shouts nearby and then emergency vehicle sirens sounds"<br />
+Output:<br />
+![](sound_extraction.png)<br />
--- a/assets/detection.png
+++ b/assets/detection.png
--- a/assets/m2b.png
+++ b/assets/m2b.png
--- a/assets/mix1.wav
+++ b/assets/mix1.wav
--- a/assets/sound_extraction.png
+++ b/assets/sound_extraction.png
--- a/assets/tsd.png
+++ b/assets/tsd.png
--- a/audio-chatgpt.py
+++ b/audio-chatgpt.py
@@ -843,7 +843,7 @@ class ConversationBot:
            Tool(name="Extract sound event from mixture audio based on language description", func=self.extraction.inference,
                 description="useful for when you extract target sound from a mixture audio, you can describe the taregt sound by text, receives audio_path and text as input. "
                             "The input to this tool should be a comma seperated string of two, representing mixture audio path and input text."),
-            Tool(name="Detect the sound event from the audio based on your descriptions", func=self.TSD.inference,
+            Tool(name="Detect the target sound event from the audio based on your descriptions", func=self.TSD.inference,
                 description="useful for when you want to know the when happens the target sound event in th audio. You can use language descriptions to instruct the model. receives text description and audio_path as input. "
                             "The input to this tool should be a string, representing the answer. ")]
        self.agent = initialize_agent(