detection and extraction

yangdongchao
2023-04-06 00:11:23 +08:00
parent 7ee017cf0d
commit 322ed8cbb2
37 changed files with 11554 additions and 3 deletions

@@ -6,6 +6,8 @@ sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Neura
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
import gradio as gr
import matplotlib
import librosa
@@ -37,7 +39,18 @@ from inference.tts.PortaSpeech import TTSInference
from utils.hparams import set_hparams
from utils.hparams import hparams as hp
import scipy.io.wavfile as wavfile
from audio_infer.utils import config as detection_config
from audio_infer.pytorch.models import PVT
from src.models import BinauralNetwork
from sound_extraction.model.LASSNet import LASSNet
from sound_extraction.utils.stft import STFT
from sound_extraction.utils.wav_io import load_wav, save_wav
from target_sound_detection.src import models as tsd_models
from target_sound_detection.src.models import event_labels
from target_sound_detection.src.utils import median_filter, decode_with_timestamps
import clip
import numpy as np
AUDIO_CHATGPT_PREFIX = """Audio ChatGPT
Audio ChatGPT cannot directly read audios, but it has a list of tools to finish different audio synthesis tasks. Each audio will have a file name formed as "audio/xxx.wav". When talking about audios, Audio ChatGPT is very strict about the file name and will never fabricate nonexistent files.
Audio ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation if a new audio is generated.
@@ -511,6 +524,261 @@ class A2T:
        caption_text = self.model(audio)
        return caption_text[0]

class SoundDetection:
    def __init__(self, device):
        self.device = device
        self.sample_rate = 32000
        self.window_size = 1024
        self.hop_size = 320
        self.mel_bins = 64
        self.fmin = 50
        self.fmax = 14000
        self.model_type = 'PVT'
        self.checkpoint_path = 'audio_detection/audio_infer/useful_ckpts/audio_detection.pth'
        self.classes_num = detection_config.classes_num
        self.labels = detection_config.labels
        self.frames_per_second = self.sample_rate // self.hop_size
        self.model = PVT(sample_rate=self.sample_rate, window_size=self.window_size,
                         hop_size=self.hop_size, mel_bins=self.mel_bins, fmin=self.fmin,
                         fmax=self.fmax, classes_num=self.classes_num)
        checkpoint = torch.load(self.checkpoint_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model'])
        self.model.to(device)

    def inference(self, audio_path):
        # Load audio and add a batch dimension: (1, audio_length)
        (waveform, _) = librosa.core.load(audio_path, sr=self.sample_rate, mono=True)
        waveform = waveform[None, :]
        waveform = torch.from_numpy(waveform)
        waveform = waveform.to(self.device)
        # Forward pass
        with torch.no_grad():
            self.model.eval()
            batch_output_dict = self.model(waveform, None)
        # Sound event detection result: (time_steps, classes_num)
        framewise_output = batch_output_dict['framewise_output'].data.cpu().numpy()[0]
        import matplotlib.pyplot as plt
        sorted_indexes = np.argsort(np.max(framewise_output, axis=0))[::-1]
        top_k = 10  # show top results
        top_result_mat = framewise_output[:, sorted_indexes[0:top_k]]  # (time_steps, top_k)
        # Plot the log spectrogram above the top-k framewise predictions
        stft = librosa.core.stft(y=waveform[0].data.cpu().numpy(), n_fft=self.window_size,
                                 hop_length=self.hop_size, window='hann', center=True)
        frames_num = stft.shape[-1]
        fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 4))
        axs[0].matshow(np.log(np.abs(stft)), origin='lower', aspect='auto', cmap='jet')
        axs[0].set_ylabel('Frequency bins')
        axs[0].set_title('Log spectrogram')
        axs[1].matshow(top_result_mat.T, origin='upper', aspect='auto', cmap='jet', vmin=0, vmax=1)
        axs[1].xaxis.set_ticks(np.arange(0, frames_num, self.frames_per_second))
        axs[1].xaxis.set_ticklabels(np.arange(0, frames_num / self.frames_per_second))
        axs[1].yaxis.set_ticks(np.arange(0, top_k))
        axs[1].yaxis.set_ticklabels(np.array(self.labels)[sorted_indexes[0:top_k]])
        axs[1].yaxis.grid(color='k', linestyle='solid', linewidth=0.3, alpha=0.3)
        axs[1].set_xlabel('Seconds')
        axs[1].xaxis.set_ticks_position('bottom')
        plt.tight_layout()
        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
        plt.savefig(image_filename)
        return image_filename
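
# A minimal usage sketch, assuming the PVT checkpoint above is available: SoundDetection
# only needs an audio path and returns the path of the saved figure in the hard-coded
# 'image/' directory. 'audio/example.wav' is a hypothetical input.
def _demo_sound_detection():
    detector = SoundDetection(device="cpu")
    figure_path = detector.inference('audio/example.wav')
    print('framewise predictions plotted to', figure_path)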

class SoundExtraction:
    def __init__(self, device):
        self.device = device
        self.model_file = 'sound_extraction/useful_ckpts/LASSNet.pt'
        self.stft = STFT()
        import torch.nn as nn
        self.model = nn.DataParallel(LASSNet(device)).to(device)
        checkpoint = torch.load(self.model_file)
        self.model.load_state_dict(checkpoint['model'])
        self.model.eval()

    def inference(self, inputs):
        # inputs is a comma separated string: "audio_path,text"
        audio_path, text = inputs.split(",", 1)
        waveform = load_wav(audio_path)
        waveform = torch.tensor(waveform).transpose(1, 0)
        mixed_mag, mixed_phase = self.stft.transform(waveform)
        text_query = ['[CLS] ' + text]
        mixed_mag = mixed_mag.transpose(2, 1).unsqueeze(0).to(self.device)
        # Predict a magnitude mask conditioned on the text query,
        # then resynthesize with the mixture phase
        est_mask = self.model(mixed_mag, text_query)
        est_mag = est_mask * mixed_mag
        est_mag = est_mag.squeeze(1)
        est_mag = est_mag.permute(0, 2, 1)
        est_wav = self.stft.inverse(est_mag.cpu().detach(), mixed_phase)
        est_wav = est_wav.squeeze(0).squeeze(0).numpy()
        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        print('audio_filename ', audio_filename)
        save_wav(est_wav, audio_filename)
        return audio_filename
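
# A minimal usage sketch, assuming LASSNet.pt is available: the tool contract packs both
# arguments into one comma separated string. 'audio/mixture.wav' and the query text are
# hypothetical placeholders.
def _demo_sound_extraction():
    extractor = SoundExtraction(device="cpu")
    separated_path = extractor.inference('audio/mixture.wav,a dog barking')
    print('extracted source written to', separated_path)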

class Binaural:
    def __init__(self, device):
        self.device = device
        self.model_file = 'mono2binaural/useful_ckpts/m2b/binaural_network.net'
        self.position_file = ['mono2binaural/useful_ckpts/m2b/tx_positions.txt',
                              'mono2binaural/useful_ckpts/m2b/tx_positions2.txt',
                              'mono2binaural/useful_ckpts/m2b/tx_positions3.txt',
                              'mono2binaural/useful_ckpts/m2b/tx_positions4.txt',
                              'mono2binaural/useful_ckpts/m2b/tx_positions5.txt']
        self.net = BinauralNetwork(view_dim=7,
                                   warpnet_layers=4,
                                   warpnet_channels=64)
        self.net.load_from_file(self.model_file)
        self.sr = 48000

    def inference(self, audio_path):
        mono, sr = librosa.load(path=audio_path, sr=self.sr, mono=True)
        mono = torch.from_numpy(mono)
        mono = mono.unsqueeze(0)
        import random
        # Pick one of the stored transmitter trajectories at random
        rand_int = random.randint(0, 4)
        view = np.loadtxt(self.position_file[rand_int]).transpose().astype(np.float32)
        view = torch.from_numpy(view)
        # One view position covers 400 audio samples; align the two lengths
        if not view.shape[-1] * 400 == mono.shape[-1]:
            mono = mono[:, :(mono.shape[-1] // 400) * 400]
        if view.shape[1] * 400 > mono.shape[1]:
            m_a = view.shape[1] - mono.shape[-1] // 400
            rand_st = random.randint(0, m_a)
            view = view[:, rand_st:rand_st + (mono.shape[-1] // 400)]
        # Binauralize and save output
        self.net.eval().to(self.device)
        mono, view = mono.to(self.device), view.to(self.device)
        chunk_size = 48000  # forward in chunks of 1s
        rec_field = 1000  # add 1000 samples as "safe bet" since warping has undefined rec. field
        rec_field -= rec_field % 400  # make sure rec_field is a multiple of 400 to match audio and view frequencies
        chunks = [
            {
                "mono": mono[:, max(0, i - rec_field):i + chunk_size],
                "view": view[:, max(0, i - rec_field) // 400:(i + chunk_size) // 400]
            }
            for i in range(0, mono.shape[-1], chunk_size)
        ]
        for i, chunk in enumerate(chunks):
            with torch.no_grad():
                mono = chunk["mono"].unsqueeze(0)
                view = chunk["view"].unsqueeze(0)
                binaural = self.net(mono, view).squeeze(0)
                if i > 0:
                    # drop the receptive-field warm-up that overlaps the previous chunk
                    binaural = binaural[:, -(mono.shape[-1] - rec_field):]
                chunk["binaural"] = binaural
        binaural = torch.cat([chunk["binaural"] for chunk in chunks], dim=-1)
        binaural = torch.clamp(binaural, min=-1, max=1).cpu()
        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        import torchaudio
        torchaudio.save(audio_filename, binaural, sr)
        print(f"Processed Binaural.run, audio_filename: {audio_filename}")
        return audio_filename
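
# A minimal usage sketch, assuming binaural_network.net and the tx_positions*.txt
# trajectories above are available; 'audio/mono.wav' is a hypothetical input. The output
# is a two-channel 48 kHz wav rendered along a randomly chosen transmitter trajectory.
def _demo_binaural():
    binauralizer = Binaural(device="cpu")
    stereo_path = binauralizer.inference('audio/mono.wav')
    print('binaural render written to', stereo_path)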

class TargetSoundDetection:
    def __init__(self, device):
        self.device = device
        self.MEL_ARGS = {
            'n_mels': 64,
            'n_fft': 2048,
            'hop_length': int(22050 * 20 / 1000),
            'win_length': int(22050 * 40 / 1000)
        }
        self.EPS = np.spacing(1)
        self.clip_model, _ = clip.load("ViT-B/32", device=self.device)
        self.event_labels = event_labels
        self.id_to_event = {i: label for i, label in enumerate(self.event_labels)}
        config = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth', map_location='cpu')
        config_parameters = dict(config)
        config_parameters['tao'] = 0.6
        if 'thres' not in config_parameters.keys():
            config_parameters['thres'] = 0.5
        if 'time_resolution' not in config_parameters.keys():
            config_parameters['time_resolution'] = 125
        model_parameters = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt',
                                      map_location=lambda storage, loc: storage)  # load parameters
        self.model = getattr(tsd_models, config_parameters['model'])(config_parameters,
            inputdim=64, outputdim=2, time_resolution=config_parameters['time_resolution'],
            **config_parameters['model_args'])
        self.model.load_state_dict(model_parameters)
        self.model = self.model.to(self.device).eval()
        self.re_embeds = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth')
        self.ref_mel = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth')

    def extract_feature(self, fname):
        import soundfile as sf
        y, sr = sf.read(fname, dtype='float32')
        print('y ', y.shape)
        ti = y.shape[0] / sr  # clip duration in seconds
        if y.ndim > 1:
            y = y.mean(1)  # downmix to mono
        y = librosa.resample(y, sr, 22050)
        lms_feature = np.log(librosa.feature.melspectrogram(y, **self.MEL_ARGS) + self.EPS).T
        return lms_feature, ti

    def build_clip(self, text):
        text = clip.tokenize(text).to(self.device)  # e.g. ["a diagram with dog", "a dog", "a cat"]
        text_features = self.clip_model.encode_text(text)
        return text_features

    def cal_similarity(self, target, retrievals):
        # Return the index of the retrieval embedding closest to the target
        ans = []
        for name in retrievals.keys():
            tmp = retrievals[name]
            s = torch.cosine_similarity(target.squeeze(), tmp.squeeze(), dim=0)
            ans.append(s.item())
        return ans.index(max(ans))

    def inference(self, text, audio_path):
        target_emb = self.build_clip(text)  # torch type
        idx = self.cal_similarity(target_emb, self.re_embeds)
        target_event = self.id_to_event[idx]
        embedding = self.ref_mel[target_event]
        embedding = torch.from_numpy(embedding)
        embedding = embedding.unsqueeze(0).to(self.device).float()
        inputs, ti = self.extract_feature(audio_path)
        inputs = torch.from_numpy(inputs)
        inputs = inputs.unsqueeze(0).to(self.device).float()
        decision, decision_up, logit = self.model(inputs, embedding)
        pred = decision_up.detach().cpu().numpy()
        pred = pred[:, :, 0]
        frame_num = decision_up.shape[1]
        time_ratio = ti / frame_num  # seconds per output frame
        filtered_pred = median_filter(pred, window_size=1, threshold=0.5)
        time_predictions = []
        for index_k in range(filtered_pred.shape[0]):
            decoded_pred = []
            decoded_pred_ = decode_with_timestamps(target_event, filtered_pred[index_k, :])
            if len(decoded_pred_) == 0:  # no positive frames: emit an empty (0, 0) segment
                decoded_pred_.append((target_event, 0, 0))
            decoded_pred.append(decoded_pred_)
            for num_batch in range(len(decoded_pred)):  # batch_size is 1 at test time
                cur_pred = pred[num_batch]
                # save each frame output, for later visualization
                label_prediction = decoded_pred[num_batch]  # frame predictions
                for event_label, onset, offset in label_prediction:
                    time_predictions.append({
                        'onset': onset * time_ratio,
                        'offset': offset * time_ratio})
        ans = ''
        for i, item in enumerate(time_predictions):
            ans = ans + 'segment' + str(i + 1) + ' start_time: ' + str(item['onset']) \
                + ' end_time: ' + str(item['offset']) + '\t'
        return ans
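
# A minimal usage sketch, assuming the tsd checkpoints above are available: the text
# query is matched against the precomputed CLIP text embeddings (re_embeds) to select
# the closest known event label, whose reference mel (ref_mel) conditions the detector.
# 'a police siren' and 'audio/street.wav' are hypothetical placeholders.
def _demo_target_sound_detection():
    tsd = TargetSoundDetection(device="cpu")
    segments = tsd.inference('a police siren', 'audio/street.wav')
    print(segments)  # e.g. "segment1 start_time: ... end_time: ..."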

class ConversationBot:
    def __init__(self):
        print("Initializing AudioChatGPT")
@@ -525,6 +793,10 @@ class ConversationBot:
        self.asr = ASR(device="cuda:1")
        self.inpaint = Inpaint(device="cuda:0")
        self.tts_ood = TTS_OOD(device="cuda:0")
        self.detection = SoundDetection(device="cuda:0")
        self.binaural = Binaural(device="cuda:1")
        self.extraction = SoundExtraction(device="cuda:0")
        self.TSD = TargetSoundDetection(device="cuda:1")
        self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
        self.tools = [
            Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
@@ -561,7 +833,19 @@ class ConversationBot:
"The input to this tool should be a string, representing the audio_path."),
Tool(name="Transcribe speech", func=self.asr.inference,
description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
"The input to this tool should be a string, representing the audio_path.")]
"The input to this tool should be a string, representing the audio_path."),
Tool(name="Detect the sound event from the audio", func=self.detection.inference,
description="useful for when you want to know what event in the audio and the sound event start or end time, receives audio_path as input. "
"The input to this tool should be a string, representing the audio_path. "),
Tool(name="Sythesize binaural audio from a mono audio input", func=self.binaural.inference,
description="useful for when you want to transfer your mono audio into binaural audio, receives audio_path as input. "
"The input to this tool should be a string, representing the audio_path. "),
Tool(name="Extract sound event from mixture audio based on language description", func=self.extraction.inference,
description="useful for when you extract target sound from a mixture audio, you can describe the taregt sound by text, receives audio_path and text as input. "
"The input to this tool should be a comma seperated string of two, representing mixture audio path and input text."),
Tool(name="Detect the sound event from the audio based on your descriptions", func=self.TSD.inference,
description="useful for when you want to know the when happens the target sound event in th audio. You can use language descriptions to instruct the model. receives text description and audio_path as input. "
"The input to this tool should be a string, representing the answer. ")]
        self.agent = initialize_agent(
            self.tools,
            self.llm,


@@ -0,0 +1,606 @@
-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train horn
-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train horn
-GCwoyCnYsY_0.000_10.000.wav 0.000 10.000 Train horn
-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train horn
-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train horn
-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train horn
-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train horn
-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Train horn
-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train horn
-u9BxBNcrw4_30.000_40.000.wav 30.000 40.000 Train horn
-zqW9xCZd80_260.000_270.000.wav 260.000 270.000 Train horn
02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train horn
0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train horn
0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train horn
0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train horn
0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train horn
0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train horn
10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train horn
1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train horn
1S5WKCcf-wU_40.000_50.000.wav 40.000 50.000 Train horn
1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train horn
1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train horn
1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train horn
1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train horn
1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train horn
26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train horn
2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train horn
2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train horn
2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train horn
2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train horn
-8baTnilyjs_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
-jG26jT3fP8_230.000_240.000.wav 230.000 240.000 Air horn, truck horn
-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Air horn, truck horn
-v7cUxke-f4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
-yeWlsEpcpA_15.000_25.000.wav 15.000 25.000 Air horn, truck horn
04KOunVOkSA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
08y2LHhxmsM_400.000_410.000.wav 400.000 410.000 Air horn, truck horn
0G73yqtBwgE_11.000_21.000.wav 11.000 21.000 Air horn, truck horn
0UPY7ws-VFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
0euD32aKYUs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn
1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
1iRgwn7p0DA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
1myTsHAIvYc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
1z0XoG6GEv4_420.000_430.000.wav 420.000 430.000 Air horn, truck horn
26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Air horn, truck horn
2KmSuPb9gwA_24.000_34.000.wav 24.000 34.000 Air horn, truck horn
2Vy5NCEkg2I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
2ZciT0XrifM_0.000_8.000.wav 0.000 8.000 Air horn, truck horn
2jOzX06bzuA_16.000_26.000.wav 16.000 26.000 Air horn, truck horn
35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Air horn, truck horn
3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Air horn, truck horn
3rGOv4evODE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
42U7xIucU68_20.000_30.000.wav 20.000 30.000 Air horn, truck horn
46r7mO2k6zY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
4EBnb2DN3Yg_13.000_23.000.wav 13.000 23.000 Air horn, truck horn
4NTjS5pFfSc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
4bvfOnX7BIE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn
-ajCLjpfGKI_83.000_93.000.wav 83.000 93.000 Car alarm
-hLSc9aPOms_13.000_23.000.wav 13.000 23.000 Car alarm
-rgDWfvxxqw_30.000_40.000.wav 30.000 40.000 Car alarm
0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Car alarm
0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car alarm
0ZPafgZftWk_80.000_90.000.wav 80.000 90.000 Car alarm
0npLQ4LzD0c_40.000_50.000.wav 40.000 50.000 Car alarm
17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Car alarm
3HxQ83IMyw4_70.000_80.000.wav 70.000 80.000 Car alarm
3z05luLEc_Q_0.000_10.000.wav 0.000 10.000 Car alarm
4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Car alarm
4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car alarm
4h01lBkTVQY_18.000_28.000.wav 18.000 28.000 Car alarm
5-SzZotiaBU_30.000_40.000.wav 30.000 40.000 Car alarm
54PbkldEp9M_30.000_40.000.wav 30.000 40.000 Car alarm
5P6YYsMaIH4_30.000_40.000.wav 30.000 40.000 Car alarm
5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car alarm
7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Car alarm
7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car alarm
7NZ0kMj2HSI_54.000_64.000.wav 54.000 64.000 Car alarm
7RQpt1_1ZzU_30.000_40.000.wav 30.000 40.000 Car alarm
7ee54nr6jG8_30.000_40.000.wav 30.000 40.000 Car alarm
8OajsyPSNt8_40.000_50.000.wav 40.000 50.000 Car alarm
9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car alarm
9fzeD7CeI7Y_110.000_120.000.wav 110.000 120.000 Car alarm
9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car alarm
A-GNszKtjJc_93.000_103.000.wav 93.000 103.000 Car alarm
A437a4Y_xag_230.000_240.000.wav 230.000 240.000 Car alarm
APMPW2YI-Zk_20.000_30.000.wav 20.000 30.000 Car alarm
AR-KmtlXg4Y_70.000_80.000.wav 70.000 80.000 Car alarm
-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Reversing beeps
-6d-zxMvC5E_30.000_40.000.wav 30.000 40.000 Reversing beeps
-6qSMlbJJ58_30.000_40.000.wav 30.000 40.000 Reversing beeps
-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Reversing beeps
-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Reversing beeps
-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Reversing beeps
-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Reversing beeps
-AXDeY-N2_M_30.000_40.000.wav 30.000 40.000 Reversing beeps
-B1uzsLG0Dk_30.000_40.000.wav 30.000 40.000 Reversing beeps
-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Reversing beeps
-Em3OpyaefM_30.000_40.000.wav 30.000 40.000 Reversing beeps
-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Reversing beeps
-SP7KWmTRUU_30.000_40.000.wav 30.000 40.000 Reversing beeps
-h4or05bj_I_30.000_40.000.wav 30.000 40.000 Reversing beeps
-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Reversing beeps
-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Reversing beeps
-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Reversing beeps
-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Reversing beeps
-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Reversing beeps
-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Reversing beeps
-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Reversing beeps
03xMfqt4fZI_24.000_34.000.wav 24.000 34.000 Reversing beeps
0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Reversing beeps
0FQo-2xRJ0E_30.000_40.000.wav 30.000 40.000 Reversing beeps
0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Reversing beeps
0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Reversing beeps
0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Reversing beeps
0P-YGHC5cBU_30.000_40.000.wav 30.000 40.000 Reversing beeps
0QKet-tdquc_30.000_40.000.wav 30.000 40.000 Reversing beeps
0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Reversing beeps
-5px8DVPl8A_28.000_38.000.wav 28.000 38.000 Bicycle
-D08wyQwDPQ_10.000_20.000.wav 10.000 20.000 Bicycle
-F1_Gh78vJ0_30.000_40.000.wav 30.000 40.000 Bicycle
-FZQIkX44Pk_10.000_20.000.wav 10.000 20.000 Bicycle
-FsvS99nWTc_30.000_40.000.wav 30.000 40.000 Bicycle
-Holdef_BZ0_30.000_40.000.wav 30.000 40.000 Bicycle
-Inn26beF70_30.000_40.000.wav 30.000 40.000 Bicycle
-Jq9HNSs_ns_14.000_24.000.wav 14.000 24.000 Bicycle
-KlN_AXMM0Q_30.000_40.000.wav 30.000 40.000 Bicycle
-NCcqKWiGus_30.000_40.000.wav 30.000 40.000 Bicycle
-NNC_TqWfGw_30.000_40.000.wav 30.000 40.000 Bicycle
-OGFiXvmldM_30.000_40.000.wav 30.000 40.000 Bicycle
-RFpDUZhN-g_13.000_23.000.wav 13.000 23.000 Bicycle
-XUfeRTw3b4_0.000_6.000.wav 0.000 6.000 Bicycle
-XoATxJ-Qcg_30.000_40.000.wav 30.000 40.000 Bicycle
-bFNxvFwDts_470.000_480.000.wav 470.000 480.000 Bicycle
-e5PokL6Cyo_30.000_40.000.wav 30.000 40.000 Bicycle
-fNyOf9zIU0_30.000_40.000.wav 30.000 40.000 Bicycle
-fhpkRyZL90_30.000_40.000.wav 30.000 40.000 Bicycle
-fo3m0hiZbg_30.000_40.000.wav 30.000 40.000 Bicycle
-ikJkNwcmkA_27.000_37.000.wav 27.000 37.000 Bicycle
-k2nMcxAjWE_30.000_40.000.wav 30.000 40.000 Bicycle
-k80ibA-fyw_30.000_40.000.wav 30.000 40.000 Bicycle
-lBcEVa_NKw_30.000_40.000.wav 30.000 40.000 Bicycle
-mQyAYU_Bd4_50.000_60.000.wav 50.000 60.000 Bicycle
-ngrinYHF4c_30.000_40.000.wav 30.000 40.000 Bicycle
-nqm_RJ2xj8_40.000_50.000.wav 40.000 50.000 Bicycle
-oAw5iTeT1g_40.000_50.000.wav 40.000 50.000 Bicycle
-p2EMzpTE38_4.000_14.000.wav 4.000 14.000 Bicycle
-qmfWP_yzn4_30.000_40.000.wav 30.000 40.000 Bicycle
-0DIFwkUpjQ_50.000_60.000.wav 50.000 60.000 Skateboard
-53qltVyjpc_180.000_190.000.wav 180.000 190.000 Skateboard
-5y4jb9eUWs_110.000_120.000.wav 110.000 120.000 Skateboard
-81kolkG8M0_0.000_8.000.wav 0.000 8.000 Skateboard
-9dwTSq6JZg_70.000_80.000.wav 70.000 80.000 Skateboard
-9oKZsjjf_0_20.000_30.000.wav 20.000 30.000 Skateboard
-AFGfu5zOzQ_30.000_40.000.wav 30.000 40.000 Skateboard
-DHGwygUsQc_30.000_40.000.wav 30.000 40.000 Skateboard
-DkuTmIs7_Q_30.000_40.000.wav 30.000 40.000 Skateboard
-E1E17R7UBA_260.000_270.000.wav 260.000 270.000 Skateboard
-E1aIXhB4YU_30.000_40.000.wav 30.000 40.000 Skateboard
-McJLXNN3-o_50.000_60.000.wav 50.000 60.000 Skateboard
-N7nQ4CXGsY_170.000_180.000.wav 170.000 180.000 Skateboard
-O5vrHFRzcY_30.000_40.000.wav 30.000 40.000 Skateboard
-Plh9jAN_Eo_0.000_2.000.wav 0.000 2.000 Skateboard
-Qd_dXTbgK0_30.000_40.000.wav 30.000 40.000 Skateboard
-aVZ-H92M_s_0.000_4.000.wav 0.000 4.000 Skateboard
-cd-Zn8qFxU_90.000_100.000.wav 90.000 100.000 Skateboard
-esP4loyvjM_60.000_70.000.wav 60.000 70.000 Skateboard
-iB3a71aPew_30.000_40.000.wav 30.000 40.000 Skateboard
-lZapwtvwlg_0.000_10.000.wav 0.000 10.000 Skateboard
-mxMaMJCXL8_180.000_190.000.wav 180.000 190.000 Skateboard
-nYGTw9Sypg_20.000_30.000.wav 20.000 30.000 Skateboard
-oS19KshdlM_30.000_40.000.wav 30.000 40.000 Skateboard
-s6uxc77NWo_40.000_50.000.wav 40.000 50.000 Skateboard
-sCrXS2kJlA_30.000_40.000.wav 30.000 40.000 Skateboard
-saCvPTdQ7s_30.000_40.000.wav 30.000 40.000 Skateboard
-sb-knLiDic_20.000_30.000.wav 20.000 30.000 Skateboard
-tSwRvqaKWg_90.000_100.000.wav 90.000 100.000 Skateboard
-x_jV34hVq4_30.000_40.000.wav 30.000 40.000 Skateboard
--ljM2Kojag_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-4F1TX-T6T4_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-7HVWUwyMig_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-9pUUT-6o8U_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Ambulance (siren)
-LGTb-xyjzA_11.000_21.000.wav 11.000 21.000 Ambulance (siren)
-Y1qiiugnk8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-ZeMV790MXE_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
-d-T8Y9-TOg_17.000_27.000.wav 17.000 27.000 Ambulance (siren)
-dcrL5JLmvo_11.000_21.000.wav 11.000 21.000 Ambulance (siren)
-fCSO8SVWZU_6.000_16.000.wav 6.000 16.000 Ambulance (siren)
-fGFQTGd2nA_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
-jnQgpHubNI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-k6p9n9y22Q_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-kr4SUjnm88_29.000_39.000.wav 29.000 39.000 Ambulance (siren)
-lyPnABQhCI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-od8LQAVgno_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-pVEgzu95Nc_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-w-9yF465IY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-woquFRnQk8_16.000_26.000.wav 16.000 26.000 Ambulance (siren)
-xz75wUCln8_50.000_60.000.wav 50.000 60.000 Ambulance (siren)
-yGElLHdkEI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-yPSgCn9AWo_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
02u3P99INjs_8.000_18.000.wav 8.000 18.000 Ambulance (siren)
06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
-0Eem_FuIto_15.000_25.000.wav 15.000 25.000 Fire engine, fire truck (siren)
-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-4B435WQvag_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren)
-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
-8uyNBFbdFc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Fire engine, fire truck (siren)
-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Fire engine, fire truck (siren)
-QBo1W2w8II_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-QX-ddNtUvE_24.000_34.000.wav 24.000 34.000 Fire engine, fire truck (siren)
-RlUu1el2G4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-SkO97C81Ms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-T8QHPXfIC4_13.000_23.000.wav 13.000 23.000 Fire engine, fire truck (siren)
-USiTjZoh88_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-Z3ByS_RCwI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-cOjJ0Nvtlw_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Fire engine, fire truck (siren)
-eYUCWGQ_wU_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
-hplTh4SGvs_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren)
-nPhg6Eu4b4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-oEGuMg8hT4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-pvaJ4DwtRg_3.000_13.000.wav 3.000 13.000 Fire engine, fire truck (siren)
-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-sJn3uUxpH8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-sfn1NDHWJI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
-09rxiqNNEs_30.000_40.000.wav 30.000 40.000 Civil defense siren
-3qh-WFUV2U_30.000_40.000.wav 30.000 40.000 Civil defense siren
-4JG_Ag99hY_30.000_40.000.wav 30.000 40.000 Civil defense siren
-60NmEaP0is_0.000_10.000.wav 0.000 10.000 Civil defense siren
-6cTEqIcics_30.000_40.000.wav 30.000 40.000 Civil defense siren
-6iVBmb5PZU_40.000_50.000.wav 40.000 50.000 Civil defense siren
-6qp8NjWffE_30.000_40.000.wav 30.000 40.000 Civil defense siren
-75iY1j3MeY_30.000_40.000.wav 30.000 40.000 Civil defense siren
-E3Yju3lrRo_30.000_40.000.wav 30.000 40.000 Civil defense siren
-FHSBdx5A3g_40.000_50.000.wav 40.000 50.000 Civil defense siren
-JhSzxTdcwY_30.000_40.000.wav 30.000 40.000 Civil defense siren
-OtNDK_Hxp8_30.000_40.000.wav 30.000 40.000 Civil defense siren
-S3_I0RiG3g_30.000_40.000.wav 30.000 40.000 Civil defense siren
-YMXgDKKAwU_30.000_40.000.wav 30.000 40.000 Civil defense siren
-c7XoYM-SSY_30.000_40.000.wav 30.000 40.000 Civil defense siren
-j8EeIX9ynk_30.000_40.000.wav 30.000 40.000 Civil defense siren
-t478yabOQw_30.000_40.000.wav 30.000 40.000 Civil defense siren
-uIyMR9luvg_30.000_40.000.wav 30.000 40.000 Civil defense siren
-wgP6ua-t4k_40.000_50.000.wav 40.000 50.000 Civil defense siren
-zGAb18JxmI_30.000_40.000.wav 30.000 40.000 Civil defense siren
03NLMEMi8-I_30.000_40.000.wav 30.000 40.000 Civil defense siren
0552YhBdeXo_30.000_40.000.wav 30.000 40.000 Civil defense siren
06TM6z3NvuY_30.000_40.000.wav 30.000 40.000 Civil defense siren
0CUi0oGUzjU_30.000_40.000.wav 30.000 40.000 Civil defense siren
0GpUFFJNFH8_30.000_40.000.wav 30.000 40.000 Civil defense siren
0H_WUo2srs0_30.000_40.000.wav 30.000 40.000 Civil defense siren
0HvYkBXQ44A_30.000_40.000.wav 30.000 40.000 Civil defense siren
0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Civil defense siren
0JKcTVpby0I_30.000_40.000.wav 30.000 40.000 Civil defense siren
0PhU-PIsUMw_40.000_50.000.wav 40.000 50.000 Civil defense siren
-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Police car (siren)
-1U98XBTyB4_30.000_40.000.wav 30.000 40.000 Police car (siren)
-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Police car (siren)
-6WqJCSmkCw_70.000_80.000.wav 70.000 80.000 Police car (siren)
-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Police car (siren)
-AFASmp1fpk_6.000_16.000.wav 6.000 16.000 Police car (siren)
-F2lk9A8B8M_30.000_40.000.wav 30.000 40.000 Police car (siren)
-GPv09qi9A8_120.000_130.000.wav 120.000 130.000 Police car (siren)
-Hi-WpRGUpc_9.000_19.000.wav 9.000 19.000 Police car (siren)
-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Police car (siren)
-MfBpxtGQmE_20.000_30.000.wav 20.000 30.000 Police car (siren)
-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Police car (siren)
-UCf_-3yzWU_290.000_300.000.wav 290.000 300.000 Police car (siren)
-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Police car (siren)
-XRiLbb3Syo_2.000_12.000.wav 2.000 12.000 Police car (siren)
-XrpzGb6xCU_190.000_200.000.wav 190.000 200.000 Police car (siren)
-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Police car (siren)
-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Police car (siren)
-_8fdnv6Crg_30.000_40.000.wav 30.000 40.000 Police car (siren)
-az6BooRLxw_40.000_50.000.wav 40.000 50.000 Police car (siren)
-bs3c27rEtc_30.000_40.000.wav 30.000 40.000 Police car (siren)
-dBTGdL4RFs_30.000_40.000.wav 30.000 40.000 Police car (siren)
-gKNRXbpAKs_30.000_40.000.wav 30.000 40.000 Police car (siren)
-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Police car (siren)
-haSUR_IUto_30.000_40.000.wav 30.000 40.000 Police car (siren)
-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Police car (siren)
-lWs7_49gss_30.000_40.000.wav 30.000 40.000 Police car (siren)
-lhnhB4rbGw_3.000_13.000.wav 3.000 13.000 Police car (siren)
-rkJeBBmiTQ_60.000_70.000.wav 60.000 70.000 Police car (siren)
-rs7FPxzc6w_8.000_18.000.wav 8.000 18.000 Police car (siren)
-20uudT97E0_30.000_40.000.wav 30.000 40.000 Screaming
-3bGlOhRkAo_140.000_150.000.wav 140.000 150.000 Screaming
-4pUrlMafww_1.000_11.000.wav 1.000 11.000 Screaming
-7R0ybQQAHg_60.000_70.000.wav 60.000 70.000 Screaming
-7gojlG6bE4_30.000_40.000.wav 30.000 40.000 Screaming
-GI5PbO6j50_30.000_40.000.wav 30.000 40.000 Screaming
-MuIRudOtxw_30.000_40.000.wav 30.000 40.000 Screaming
-WfQBr42ymw_30.000_40.000.wav 30.000 40.000 Screaming
-YOjIgYspsY_30.000_40.000.wav 30.000 40.000 Screaming
-g_AcRVFfXU_30.000_40.000.wav 30.000 40.000 Screaming
-gb5uvwsRpI_30.000_40.000.wav 30.000 40.000 Screaming
-iAwqlQ3TEk_0.000_3.000.wav 0.000 3.000 Screaming
-nJoxcmxz5g_30.000_40.000.wav 30.000 40.000 Screaming
-pwgypWE-J8_30.000_40.000.wav 30.000 40.000 Screaming
-pzasCR0kpc_30.000_40.000.wav 30.000 40.000 Screaming
-sUgHKZQKYc_30.000_40.000.wav 30.000 40.000 Screaming
-uazzQEmQ7c_0.000_10.000.wav 0.000 10.000 Screaming
-vHJU1wDRsY_30.000_40.000.wav 30.000 40.000 Screaming
0-RnTXpp8Q0_30.000_40.000.wav 30.000 40.000 Screaming
09YQukdYVI4_30.000_40.000.wav 30.000 40.000 Screaming
0Ees8KFCUXM_30.000_40.000.wav 30.000 40.000 Screaming
0EymGuYWkFk_30.000_40.000.wav 30.000 40.000 Screaming
0Nw1OyTsaAo_30.000_40.000.wav 30.000 40.000 Screaming
0YnOMAls83g_30.000_40.000.wav 30.000 40.000 Screaming
0_gyUQkLCY8_30.000_40.000.wav 30.000 40.000 Screaming
0_hnDV2SHBI_7.000_17.000.wav 7.000 17.000 Screaming
0cqEaAkbrbI_80.000_90.000.wav 80.000 90.000 Screaming
0hC044mDsWA_30.000_40.000.wav 30.000 40.000 Screaming
0kQANiakiH0_30.000_40.000.wav 30.000 40.000 Screaming
0rVBXpbgO8s_30.000_40.000.wav 30.000 40.000 Screaming
---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car
--330hg-Ocw_30.000_40.000.wav 30.000 40.000 Car
--8puiAGLhs_30.000_40.000.wav 30.000 40.000 Car
--9VR_F7CtY_30.000_40.000.wav 30.000 40.000 Car
--F70LWypIg_30.000_40.000.wav 30.000 40.000 Car
--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car
--QvRbvnbUE_30.000_40.000.wav 30.000 40.000 Car
--SeOZy3Yik_30.000_40.000.wav 30.000 40.000 Car
--Zz7BgxSUg_30.000_40.000.wav 30.000 40.000 Car
--e0Vu_ruTc_30.000_40.000.wav 30.000 40.000 Car
--iFD6IyQW8_30.000_40.000.wav 30.000 40.000 Car
--jGnLqFsQ4_24.000_34.000.wav 24.000 34.000 Car
--jc0NAxK8M_30.000_40.000.wav 30.000 40.000 Car
--v1WjOJv-w_150.000_160.000.wav 150.000 160.000 Car
--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car
--yaQA8d1dI_6.000_16.000.wav 6.000 16.000 Car
--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car
-0-jXXldDOU_10.000_20.000.wav 10.000 20.000 Car
-03ld83JliM_29.000_39.000.wav 29.000 39.000 Car
-0B-egfXU7E_30.000_40.000.wav 30.000 40.000 Car
-0Bkyt8iZ1I_8.000_18.000.wav 8.000 18.000 Car
-0CIk-OOp7Y_30.000_40.000.wav 30.000 40.000 Car
-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car
-0CY5NWBHyY_20.000_30.000.wav 20.000 30.000 Car
-0HsrVfb5vc_20.000_30.000.wav 20.000 30.000 Car
-0I89-H0AFo_26.000_36.000.wav 26.000 36.000 Car
-0P6VDQ1YDs_80.000_90.000.wav 80.000 90.000 Car
-0PrEsytvc0_30.000_40.000.wav 30.000 40.000 Car
-0RqnaXZu_E_30.000_40.000.wav 30.000 40.000 Car
-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car
---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car passing by
--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car passing by
--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car passing by
--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car passing by
--zbPxnl27o_20.000_30.000.wav 20.000 30.000 Car passing by
-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car passing by
-0MnD7jBvkE_0.000_4.000.wav 0.000 4.000 Car passing by
-0U3c4PN8sc_30.000_40.000.wav 30.000 40.000 Car passing by
-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car passing by
-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car passing by
-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car passing by
-15nPYi2v1g_30.000_40.000.wav 30.000 40.000 Car passing by
-19pq3HJoBM_30.000_40.000.wav 30.000 40.000 Car passing by
-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car passing by
-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car passing by
-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car passing by
-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car passing by
-2-luek6dI8_30.000_40.000.wav 30.000 40.000 Car passing by
-21-RfxQscI_30.000_40.000.wav 30.000 40.000 Car passing by
-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car passing by
-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car passing by
-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car passing by
-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car passing by
-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car passing by
-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car passing by
-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car passing by
-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car passing by
-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car passing by
-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car passing by
-3exNVlj92w_30.000_40.000.wav 30.000 40.000 Car passing by
--0w1YA1Hm4_30.000_40.000.wav 30.000 40.000 Bus
-0_vEaaXndY_11.000_21.000.wav 11.000 21.000 Bus
-5GcZwBvBdI_30.000_40.000.wav 30.000 40.000 Bus
-5digoPWn6U_8.000_18.000.wav 8.000 18.000 Bus
-79l4w4DsYM_30.000_40.000.wav 30.000 40.000 Bus
-7B4pbkIEas_30.000_40.000.wav 30.000 40.000 Bus
-8YTu7ZGA2w_30.000_40.000.wav 30.000 40.000 Bus
-93IM29_8rs_14.000_24.000.wav 14.000 24.000 Bus
-9GhPxGkpio_26.000_36.000.wav 26.000 36.000 Bus
-9J9xs7LM9Y_25.000_35.000.wav 25.000 35.000 Bus
-AY_lZLYJR8_8.000_18.000.wav 8.000 18.000 Bus
-AdQBgtN_4E_30.000_40.000.wav 30.000 40.000 Bus
-BxfsWlPUPY_30.000_40.000.wav 30.000 40.000 Bus
-CgCr8Eknm0_14.000_24.000.wav 14.000 24.000 Bus
-CnsvTDIXdE_20.000_30.000.wav 20.000 30.000 Bus
-CpMlnGhxEU_0.000_9.000.wav 0.000 9.000 Bus
-DP_cv0x_Ng_30.000_40.000.wav 30.000 40.000 Bus
-FEXRjcryZE_30.000_40.000.wav 30.000 40.000 Bus
-Fp2-w-iLiE_20.000_30.000.wav 20.000 30.000 Bus
-GLk6G9U09A_30.000_40.000.wav 30.000 40.000 Bus
-Ga9sSkpngg_30.000_40.000.wav 30.000 40.000 Bus
-H8V23dZoLo_0.000_10.000.wav 0.000 10.000 Bus
-HeQfwKbFzg_30.000_40.000.wav 30.000 40.000 Bus
-HzzEuFBiDU_30.000_40.000.wav 30.000 40.000 Bus
-I4INTpMKT4_30.000_40.000.wav 30.000 40.000 Bus
-II-7qJxKPc_21.000_31.000.wav 21.000 31.000 Bus
-LnpzyfTkF8_30.000_40.000.wav 30.000 40.000 Bus
-OgRshQfsi8_30.000_40.000.wav 30.000 40.000 Bus
-P53lJ1ViWk_30.000_40.000.wav 30.000 40.000 Bus
-PvNUvEov4Q_30.000_40.000.wav 30.000 40.000 Bus
--12UOziMF0_30.000_40.000.wav 30.000 40.000 Truck
--73E04RpiQ_0.000_9.000.wav 0.000 9.000 Truck
--J947HxQVM_0.000_9.000.wav 0.000 9.000 Truck
--bD1DVKlzQ_30.000_40.000.wav 30.000 40.000 Truck
--ivFZu-hlc_30.000_40.000.wav 30.000 40.000 Truck
--wuU7kzB5o_30.000_40.000.wav 30.000 40.000 Truck
-0B_CYyG5Dg_30.000_40.000.wav 30.000 40.000 Truck
-0JqTq_4jaE_40.000_50.000.wav 40.000 50.000 Truck
-0MrEZKJ5MQ_30.000_40.000.wav 30.000 40.000 Truck
-0awng26xQ8_30.000_40.000.wav 30.000 40.000 Truck
-0dq1Vg9rd8_30.000_40.000.wav 30.000 40.000 Truck
-0wkq7CUYME_310.000_320.000.wav 310.000 320.000 Truck
-14RXdkqYuI_30.000_40.000.wav 30.000 40.000 Truck
-1B3CzpiW1M_30.000_40.000.wav 30.000 40.000 Truck
-1Q21cZhHDE_30.000_40.000.wav 30.000 40.000 Truck
-1ZXXnBXJ6c_8.000_18.000.wav 8.000 18.000 Truck
-1s0DWApvT8_30.000_40.000.wav 30.000 40.000 Truck
-1s84_2Vn4g_30.000_40.000.wav 30.000 40.000 Truck
-26ansJluVo_30.000_40.000.wav 30.000 40.000 Truck
-2EscdO0l-A_30.000_40.000.wav 30.000 40.000 Truck
-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Truck
-2NBZUCcvm0_30.000_40.000.wav 30.000 40.000 Truck
-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Truck
-2vmprMUw10_30.000_40.000.wav 30.000 40.000 Truck
-2x4TB8VWvE_18.000_28.000.wav 18.000 28.000 Truck
-39q4y0tt-g_30.000_40.000.wav 30.000 40.000 Truck
-3N5rjPrNCc_190.000_200.000.wav 190.000 200.000 Truck
-3NcUIyJtFY_30.000_40.000.wav 30.000 40.000 Truck
-3PplV0ErOk_30.000_40.000.wav 30.000 40.000 Truck
-3gSkrDKNSA_27.000_37.000.wav 27.000 37.000 Truck
--p-rk_HBuU_30.000_40.000.wav 30.000 40.000 Motorcycle
-1WK72M4xeg_220.000_230.000.wav 220.000 230.000 Motorcycle
-1XfuJcdvfg_30.000_40.000.wav 30.000 40.000 Motorcycle
-3XWBAmjmaQ_11.000_21.000.wav 11.000 21.000 Motorcycle
-4-87UgJcUw_70.000_80.000.wav 70.000 80.000 Motorcycle
-4D3Gkyisyc_30.000_40.000.wav 30.000 40.000 Motorcycle
-5k5GyHd2So_4.000_14.000.wav 4.000 14.000 Motorcycle
-6A2L1U9b5Y_54.000_64.000.wav 54.000 64.000 Motorcycle
-6Yfati1N10_80.000_90.000.wav 80.000 90.000 Motorcycle
-7_o_GhpZpM_12.000_22.000.wav 12.000 22.000 Motorcycle
-7rZwMK6uSs_70.000_80.000.wav 70.000 80.000 Motorcycle
-85f5DKKfSo_30.000_40.000.wav 30.000 40.000 Motorcycle
-9Smdrt5zwk_40.000_50.000.wav 40.000 50.000 Motorcycle
-9gZLVDKpnE_30.000_40.000.wav 30.000 40.000 Motorcycle
-BGebo8V4XY_30.000_40.000.wav 30.000 40.000 Motorcycle
-DdiduB5B_w_190.000_200.000.wav 190.000 200.000 Motorcycle
-HIPq7T3eFI_11.000_21.000.wav 11.000 21.000 Motorcycle
-H_3oEkKe0M_50.000_60.000.wav 50.000 60.000 Motorcycle
-HmuMoykRqA_500.000_510.000.wav 500.000 510.000 Motorcycle
-IMRE_psvtI_30.000_40.000.wav 30.000 40.000 Motorcycle
-Ie4LSPDEF4_6.000_16.000.wav 6.000 16.000 Motorcycle
-J0F29UCZiA_70.000_80.000.wav 70.000 80.000 Motorcycle
-KFCJ7ydu2E_0.000_10.000.wav 0.000 10.000 Motorcycle
-KmDAgYb0Uo_100.000_110.000.wav 100.000 110.000 Motorcycle
-P7iW3WzNfc_400.000_410.000.wav 400.000 410.000 Motorcycle
-QMAKXzIGx4_10.000_20.000.wav 10.000 20.000 Motorcycle
-S-5z2vYtxw_10.000_20.000.wav 10.000 20.000 Motorcycle
-SlL0NZh51w_30.000_40.000.wav 30.000 40.000 Motorcycle
-US2mpJxbj4_30.000_40.000.wav 30.000 40.000 Motorcycle
-VO-C9C0uqY_1.000_11.000.wav 1.000 11.000 Motorcycle
--H_-CEB2wA_30.000_40.000.wav 30.000 40.000 Train
-1VsFy0eVJs_30.000_40.000.wav 30.000 40.000 Train
-1X7kpLnOpM_60.000_70.000.wav 60.000 70.000 Train
-3FIglJti0s_30.000_40.000.wav 30.000 40.000 Train
-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train
-6KOEEiAf9s_19.000_29.000.wav 19.000 29.000 Train
-97l_c6PToE_30.000_40.000.wav 30.000 40.000 Train
-9S5Z-uciLo_70.000_80.000.wav 70.000 80.000 Train
-CkgGfKepO4_140.000_150.000.wav 140.000 150.000 Train
-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train
-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train
-JpQivta6MQ_20.000_30.000.wav 20.000 30.000 Train
-K9oTZj3mVQ_30.000_40.000.wav 30.000 40.000 Train
-KjE40DlSdU_0.000_10.000.wav 0.000 10.000 Train
-NrFtZ_xxFU_30.000_40.000.wav 30.000 40.000 Train
-PYRamK58Ss_0.000_10.000.wav 0.000 10.000 Train
-P_XDJt4p_s_30.000_40.000.wav 30.000 40.000 Train
-Pjylzex7oc_350.000_360.000.wav 350.000 360.000 Train
-QHuZGmIy_I_30.000_40.000.wav 30.000 40.000 Train
-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train
-RXKRoRPWXg_30.000_40.000.wav 30.000 40.000 Train
-VH414svzI0_30.000_40.000.wav 30.000 40.000 Train
-WFdYxE-PYI_30.000_40.000.wav 30.000 40.000 Train
-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train
-XcC-UlbcRA_30.000_40.000.wav 30.000 40.000 Train
-Y2cD8xvCHI_30.000_40.000.wav 30.000 40.000 Train
-ZKZkMHe3cY_70.000_80.000.wav 70.000 80.000 Train
-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train
-aZ7XC4LG2A_30.000_40.000.wav 30.000 40.000 Train
-abVemAm9HM_430.000_440.000.wav 430.000 440.000 Train
1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Ambulance (siren)
-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Fire engine, fire truck (siren)
4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Civil defense siren
06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Police car (siren)
0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Police car (siren)
0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Police car (siren)
17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Police car (siren)
4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Police car (siren)
-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car
-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Car
-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car
-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car
-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car
-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car
-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car
-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car
-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Car
-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car
-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car
-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car
-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car
-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car
-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car
-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car
-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car
-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car
-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Car
-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Car
-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Car
-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Car
06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Car
0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Car
0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car
4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car
5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car
7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car
9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car
9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car
-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Car passing by
9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car passing by
-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Bus
-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Truck
-4B435WQvag_20.000_30.000.wav 20.000 30.000 Truck
-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Truck
-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Truck
-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Truck
-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Truck
-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Truck
-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Truck
-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Truck
-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Truck
-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Truck
-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Truck
-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Truck
-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Truck
-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Truck
-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Truck
-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Truck
-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Truck
-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Truck
-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Truck
-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Truck
-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Truck
-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Truck
-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Truck
0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Truck
0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Truck
0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Truck
0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Truck
0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Truck
3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Truck
-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train
02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train
0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train
0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train
0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train
0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train
0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train
10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train
1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train
1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train
1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train
1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train
1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train
1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train
26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train
2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train
2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train
2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train
2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train
3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Train
189 -ZeMV790MXE_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
190 -d-T8Y9-TOg_17.000_27.000.wav 17.000 27.000 Ambulance (siren)
191 -dcrL5JLmvo_11.000_21.000.wav 11.000 21.000 Ambulance (siren)
192 -fCSO8SVWZU_6.000_16.000.wav 6.000 16.000 Ambulance (siren)
193 -fGFQTGd2nA_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
194 -hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Ambulance (siren)
195 -jnQgpHubNI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
196 -k6p9n9y22Q_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
197 -kr4SUjnm88_29.000_39.000.wav 29.000 39.000 Ambulance (siren)
198 -lyPnABQhCI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
199 -od8LQAVgno_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
200 -pVEgzu95Nc_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
201 -w-9yF465IY_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
202 -woquFRnQk8_16.000_26.000.wav 16.000 26.000 Ambulance (siren)
203 -xz75wUCln8_50.000_60.000.wav 50.000 60.000 Ambulance (siren)
204 -yGElLHdkEI_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
205 -yPSgCn9AWo_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
206 -z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
207 00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
208 02u3P99INjs_8.000_18.000.wav 8.000 18.000 Ambulance (siren)
209 06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Ambulance (siren)
210 0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
211 -0Eem_FuIto_15.000_25.000.wav 15.000 25.000 Fire engine, fire truck (siren)
212 -2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
213 -45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
214 -4B435WQvag_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren)
215 -6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
216 -8uyNBFbdFc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
217 -Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
218 -KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Fire engine, fire truck (siren)
219 -PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Fire engine, fire truck (siren)
220 -QBo1W2w8II_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
221 -QX-ddNtUvE_24.000_34.000.wav 24.000 34.000 Fire engine, fire truck (siren)
222 -RlUu1el2G4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
223 -SkO97C81Ms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
224 -T8QHPXfIC4_13.000_23.000.wav 13.000 23.000 Fire engine, fire truck (siren)
225 -USiTjZoh88_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
226 -X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
227 -Z3ByS_RCwI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
228 -ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
229 -cOjJ0Nvtlw_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren)
230 -cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Fire engine, fire truck (siren)
231 -eYUCWGQ_wU_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
232 -hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren)
233 -hplTh4SGvs_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren)
234 -nPhg6Eu4b4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
235 -oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
236 -oEGuMg8hT4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
237 -pvaJ4DwtRg_3.000_13.000.wav 3.000 13.000 Fire engine, fire truck (siren)
238 -qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
239 -sJn3uUxpH8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
240 -sfn1NDHWJI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
241 -09rxiqNNEs_30.000_40.000.wav 30.000 40.000 Civil defense siren
242 -3qh-WFUV2U_30.000_40.000.wav 30.000 40.000 Civil defense siren
243 -4JG_Ag99hY_30.000_40.000.wav 30.000 40.000 Civil defense siren
244 -60NmEaP0is_0.000_10.000.wav 0.000 10.000 Civil defense siren
245 -6cTEqIcics_30.000_40.000.wav 30.000 40.000 Civil defense siren
246 -6iVBmb5PZU_40.000_50.000.wav 40.000 50.000 Civil defense siren
247 -6qp8NjWffE_30.000_40.000.wav 30.000 40.000 Civil defense siren
248 -75iY1j3MeY_30.000_40.000.wav 30.000 40.000 Civil defense siren
249 -E3Yju3lrRo_30.000_40.000.wav 30.000 40.000 Civil defense siren
250 -FHSBdx5A3g_40.000_50.000.wav 40.000 50.000 Civil defense siren
251 -JhSzxTdcwY_30.000_40.000.wav 30.000 40.000 Civil defense siren
252 -OtNDK_Hxp8_30.000_40.000.wav 30.000 40.000 Civil defense siren
253 -S3_I0RiG3g_30.000_40.000.wav 30.000 40.000 Civil defense siren
254 -YMXgDKKAwU_30.000_40.000.wav 30.000 40.000 Civil defense siren
255 -c7XoYM-SSY_30.000_40.000.wav 30.000 40.000 Civil defense siren
256 -j8EeIX9ynk_30.000_40.000.wav 30.000 40.000 Civil defense siren
257 -t478yabOQw_30.000_40.000.wav 30.000 40.000 Civil defense siren
258 -uIyMR9luvg_30.000_40.000.wav 30.000 40.000 Civil defense siren
259 -wgP6ua-t4k_40.000_50.000.wav 40.000 50.000 Civil defense siren
260 -zGAb18JxmI_30.000_40.000.wav 30.000 40.000 Civil defense siren
261 03NLMEMi8-I_30.000_40.000.wav 30.000 40.000 Civil defense siren
262 0552YhBdeXo_30.000_40.000.wav 30.000 40.000 Civil defense siren
263 06TM6z3NvuY_30.000_40.000.wav 30.000 40.000 Civil defense siren
264 0CUi0oGUzjU_30.000_40.000.wav 30.000 40.000 Civil defense siren
265 0GpUFFJNFH8_30.000_40.000.wav 30.000 40.000 Civil defense siren
266 0H_WUo2srs0_30.000_40.000.wav 30.000 40.000 Civil defense siren
267 0HvYkBXQ44A_30.000_40.000.wav 30.000 40.000 Civil defense siren
268 0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Civil defense siren
269 0JKcTVpby0I_30.000_40.000.wav 30.000 40.000 Civil defense siren
270 0PhU-PIsUMw_40.000_50.000.wav 40.000 50.000 Civil defense siren
271 -122tCXtFhU_30.000_40.000.wav 30.000 40.000 Police car (siren)
272 -1U98XBTyB4_30.000_40.000.wav 30.000 40.000 Police car (siren)
273 -2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Police car (siren)
274 -6WqJCSmkCw_70.000_80.000.wav 70.000 80.000 Police car (siren)
275 -AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Police car (siren)
276 -AFASmp1fpk_6.000_16.000.wav 6.000 16.000 Police car (siren)
277 -F2lk9A8B8M_30.000_40.000.wav 30.000 40.000 Police car (siren)
278 -GPv09qi9A8_120.000_130.000.wav 120.000 130.000 Police car (siren)
279 -Hi-WpRGUpc_9.000_19.000.wav 9.000 19.000 Police car (siren)
280 -KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Police car (siren)
281 -MfBpxtGQmE_20.000_30.000.wav 20.000 30.000 Police car (siren)
282 -Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Police car (siren)
283 -UCf_-3yzWU_290.000_300.000.wav 290.000 300.000 Police car (siren)
284 -VULyMtKazE_0.000_7.000.wav 0.000 7.000 Police car (siren)
285 -XRiLbb3Syo_2.000_12.000.wav 2.000 12.000 Police car (siren)
286 -XrpzGb6xCU_190.000_200.000.wav 190.000 200.000 Police car (siren)
287 -YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Police car (siren)
288 -ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Police car (siren)
289 -_8fdnv6Crg_30.000_40.000.wav 30.000 40.000 Police car (siren)
290 -az6BooRLxw_40.000_50.000.wav 40.000 50.000 Police car (siren)
291 -bs3c27rEtc_30.000_40.000.wav 30.000 40.000 Police car (siren)
292 -dBTGdL4RFs_30.000_40.000.wav 30.000 40.000 Police car (siren)
293 -gKNRXbpAKs_30.000_40.000.wav 30.000 40.000 Police car (siren)
294 -hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Police car (siren)
295 -haSUR_IUto_30.000_40.000.wav 30.000 40.000 Police car (siren)
296 -l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Police car (siren)
297 -lWs7_49gss_30.000_40.000.wav 30.000 40.000 Police car (siren)
298 -lhnhB4rbGw_3.000_13.000.wav 3.000 13.000 Police car (siren)
299 -rkJeBBmiTQ_60.000_70.000.wav 60.000 70.000 Police car (siren)
300 -rs7FPxzc6w_8.000_18.000.wav 8.000 18.000 Police car (siren)
301 -20uudT97E0_30.000_40.000.wav 30.000 40.000 Screaming
302 -3bGlOhRkAo_140.000_150.000.wav 140.000 150.000 Screaming
303 -4pUrlMafww_1.000_11.000.wav 1.000 11.000 Screaming
304 -7R0ybQQAHg_60.000_70.000.wav 60.000 70.000 Screaming
305 -7gojlG6bE4_30.000_40.000.wav 30.000 40.000 Screaming
306 -GI5PbO6j50_30.000_40.000.wav 30.000 40.000 Screaming
307 -MuIRudOtxw_30.000_40.000.wav 30.000 40.000 Screaming
308 -WfQBr42ymw_30.000_40.000.wav 30.000 40.000 Screaming
309 -YOjIgYspsY_30.000_40.000.wav 30.000 40.000 Screaming
310 -g_AcRVFfXU_30.000_40.000.wav 30.000 40.000 Screaming
311 -gb5uvwsRpI_30.000_40.000.wav 30.000 40.000 Screaming
312 -iAwqlQ3TEk_0.000_3.000.wav 0.000 3.000 Screaming
313 -nJoxcmxz5g_30.000_40.000.wav 30.000 40.000 Screaming
314 -pwgypWE-J8_30.000_40.000.wav 30.000 40.000 Screaming
315 -pzasCR0kpc_30.000_40.000.wav 30.000 40.000 Screaming
316 -sUgHKZQKYc_30.000_40.000.wav 30.000 40.000 Screaming
317 -uazzQEmQ7c_0.000_10.000.wav 0.000 10.000 Screaming
318 -vHJU1wDRsY_30.000_40.000.wav 30.000 40.000 Screaming
319 0-RnTXpp8Q0_30.000_40.000.wav 30.000 40.000 Screaming
320 09YQukdYVI4_30.000_40.000.wav 30.000 40.000 Screaming
321 0Ees8KFCUXM_30.000_40.000.wav 30.000 40.000 Screaming
322 0EymGuYWkFk_30.000_40.000.wav 30.000 40.000 Screaming
323 0Nw1OyTsaAo_30.000_40.000.wav 30.000 40.000 Screaming
324 0YnOMAls83g_30.000_40.000.wav 30.000 40.000 Screaming
325 0_gyUQkLCY8_30.000_40.000.wav 30.000 40.000 Screaming
326 0_hnDV2SHBI_7.000_17.000.wav 7.000 17.000 Screaming
327 0cqEaAkbrbI_80.000_90.000.wav 80.000 90.000 Screaming
328 0hC044mDsWA_30.000_40.000.wav 30.000 40.000 Screaming
329 0kQANiakiH0_30.000_40.000.wav 30.000 40.000 Screaming
330 0rVBXpbgO8s_30.000_40.000.wav 30.000 40.000 Screaming
331 ---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car
332 --330hg-Ocw_30.000_40.000.wav 30.000 40.000 Car
333 --8puiAGLhs_30.000_40.000.wav 30.000 40.000 Car
334 --9VR_F7CtY_30.000_40.000.wav 30.000 40.000 Car
335 --F70LWypIg_30.000_40.000.wav 30.000 40.000 Car
336 --P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car
337 --QvRbvnbUE_30.000_40.000.wav 30.000 40.000 Car
338 --SeOZy3Yik_30.000_40.000.wav 30.000 40.000 Car
339 --Zz7BgxSUg_30.000_40.000.wav 30.000 40.000 Car
340 --e0Vu_ruTc_30.000_40.000.wav 30.000 40.000 Car
341 --iFD6IyQW8_30.000_40.000.wav 30.000 40.000 Car
342 --jGnLqFsQ4_24.000_34.000.wav 24.000 34.000 Car
343 --jc0NAxK8M_30.000_40.000.wav 30.000 40.000 Car
344 --v1WjOJv-w_150.000_160.000.wav 150.000 160.000 Car
345 --xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car
346 --yaQA8d1dI_6.000_16.000.wav 6.000 16.000 Car
347 --zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car
348 -0-jXXldDOU_10.000_20.000.wav 10.000 20.000 Car
349 -03ld83JliM_29.000_39.000.wav 29.000 39.000 Car
350 -0B-egfXU7E_30.000_40.000.wav 30.000 40.000 Car
351 -0Bkyt8iZ1I_8.000_18.000.wav 8.000 18.000 Car
352 -0CIk-OOp7Y_30.000_40.000.wav 30.000 40.000 Car
353 -0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car
354 -0CY5NWBHyY_20.000_30.000.wav 20.000 30.000 Car
355 -0HsrVfb5vc_20.000_30.000.wav 20.000 30.000 Car
356 -0I89-H0AFo_26.000_36.000.wav 26.000 36.000 Car
357 -0P6VDQ1YDs_80.000_90.000.wav 80.000 90.000 Car
358 -0PrEsytvc0_30.000_40.000.wav 30.000 40.000 Car
359 -0RqnaXZu_E_30.000_40.000.wav 30.000 40.000 Car
360 -0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car
361 ---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car passing by
362 --P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car passing by
363 --xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car passing by
364 --zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car passing by
365 --zbPxnl27o_20.000_30.000.wav 20.000 30.000 Car passing by
366 -0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car passing by
367 -0MnD7jBvkE_0.000_4.000.wav 0.000 4.000 Car passing by
368 -0U3c4PN8sc_30.000_40.000.wav 30.000 40.000 Car passing by
369 -0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car passing by
370 -10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car passing by
371 -14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car passing by
372 -15nPYi2v1g_30.000_40.000.wav 30.000 40.000 Car passing by
373 -19pq3HJoBM_30.000_40.000.wav 30.000 40.000 Car passing by
374 -1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car passing by
375 -1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car passing by
376 -1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car passing by
377 -1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car passing by
378 -2-luek6dI8_30.000_40.000.wav 30.000 40.000 Car passing by
379 -21-RfxQscI_30.000_40.000.wav 30.000 40.000 Car passing by
380 -25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car passing by
381 -2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car passing by
382 -2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car passing by
383 -2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car passing by
384 -31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car passing by
385 -35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car passing by
386 -3929cmVE20_30.000_40.000.wav 30.000 40.000 Car passing by
387 -3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car passing by
388 -3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car passing by
389 -3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car passing by
390 -3exNVlj92w_30.000_40.000.wav 30.000 40.000 Car passing by
391 --0w1YA1Hm4_30.000_40.000.wav 30.000 40.000 Bus
392 -0_vEaaXndY_11.000_21.000.wav 11.000 21.000 Bus
393 -5GcZwBvBdI_30.000_40.000.wav 30.000 40.000 Bus
394 -5digoPWn6U_8.000_18.000.wav 8.000 18.000 Bus
395 -79l4w4DsYM_30.000_40.000.wav 30.000 40.000 Bus
396 -7B4pbkIEas_30.000_40.000.wav 30.000 40.000 Bus
397 -8YTu7ZGA2w_30.000_40.000.wav 30.000 40.000 Bus
398 -93IM29_8rs_14.000_24.000.wav 14.000 24.000 Bus
399 -9GhPxGkpio_26.000_36.000.wav 26.000 36.000 Bus
400 -9J9xs7LM9Y_25.000_35.000.wav 25.000 35.000 Bus
401 -AY_lZLYJR8_8.000_18.000.wav 8.000 18.000 Bus
402 -AdQBgtN_4E_30.000_40.000.wav 30.000 40.000 Bus
403 -BxfsWlPUPY_30.000_40.000.wav 30.000 40.000 Bus
404 -CgCr8Eknm0_14.000_24.000.wav 14.000 24.000 Bus
405 -CnsvTDIXdE_20.000_30.000.wav 20.000 30.000 Bus
406 -CpMlnGhxEU_0.000_9.000.wav 0.000 9.000 Bus
407 -DP_cv0x_Ng_30.000_40.000.wav 30.000 40.000 Bus
408 -FEXRjcryZE_30.000_40.000.wav 30.000 40.000 Bus
409 -Fp2-w-iLiE_20.000_30.000.wav 20.000 30.000 Bus
410 -GLk6G9U09A_30.000_40.000.wav 30.000 40.000 Bus
411 -Ga9sSkpngg_30.000_40.000.wav 30.000 40.000 Bus
412 -H8V23dZoLo_0.000_10.000.wav 0.000 10.000 Bus
413 -HeQfwKbFzg_30.000_40.000.wav 30.000 40.000 Bus
414 -HzzEuFBiDU_30.000_40.000.wav 30.000 40.000 Bus
415 -I4INTpMKT4_30.000_40.000.wav 30.000 40.000 Bus
416 -II-7qJxKPc_21.000_31.000.wav 21.000 31.000 Bus
417 -LnpzyfTkF8_30.000_40.000.wav 30.000 40.000 Bus
418 -OgRshQfsi8_30.000_40.000.wav 30.000 40.000 Bus
419 -P53lJ1ViWk_30.000_40.000.wav 30.000 40.000 Bus
420 -PvNUvEov4Q_30.000_40.000.wav 30.000 40.000 Bus
421 --12UOziMF0_30.000_40.000.wav 30.000 40.000 Truck
422 --73E04RpiQ_0.000_9.000.wav 0.000 9.000 Truck
423 --J947HxQVM_0.000_9.000.wav 0.000 9.000 Truck
424 --bD1DVKlzQ_30.000_40.000.wav 30.000 40.000 Truck
425 --ivFZu-hlc_30.000_40.000.wav 30.000 40.000 Truck
426 --wuU7kzB5o_30.000_40.000.wav 30.000 40.000 Truck
427 -0B_CYyG5Dg_30.000_40.000.wav 30.000 40.000 Truck
428 -0JqTq_4jaE_40.000_50.000.wav 40.000 50.000 Truck
429 -0MrEZKJ5MQ_30.000_40.000.wav 30.000 40.000 Truck
430 -0awng26xQ8_30.000_40.000.wav 30.000 40.000 Truck
431 -0dq1Vg9rd8_30.000_40.000.wav 30.000 40.000 Truck
432 -0wkq7CUYME_310.000_320.000.wav 310.000 320.000 Truck
433 -14RXdkqYuI_30.000_40.000.wav 30.000 40.000 Truck
434 -1B3CzpiW1M_30.000_40.000.wav 30.000 40.000 Truck
435 -1Q21cZhHDE_30.000_40.000.wav 30.000 40.000 Truck
436 -1ZXXnBXJ6c_8.000_18.000.wav 8.000 18.000 Truck
437 -1s0DWApvT8_30.000_40.000.wav 30.000 40.000 Truck
438 -1s84_2Vn4g_30.000_40.000.wav 30.000 40.000 Truck
439 -26ansJluVo_30.000_40.000.wav 30.000 40.000 Truck
440 -2EscdO0l-A_30.000_40.000.wav 30.000 40.000 Truck
441 -2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Truck
442 -2NBZUCcvm0_30.000_40.000.wav 30.000 40.000 Truck
443 -2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Truck
444 -2vmprMUw10_30.000_40.000.wav 30.000 40.000 Truck
445 -2x4TB8VWvE_18.000_28.000.wav 18.000 28.000 Truck
446 -39q4y0tt-g_30.000_40.000.wav 30.000 40.000 Truck
447 -3N5rjPrNCc_190.000_200.000.wav 190.000 200.000 Truck
448 -3NcUIyJtFY_30.000_40.000.wav 30.000 40.000 Truck
449 -3PplV0ErOk_30.000_40.000.wav 30.000 40.000 Truck
450 -3gSkrDKNSA_27.000_37.000.wav 27.000 37.000 Truck
451 --p-rk_HBuU_30.000_40.000.wav 30.000 40.000 Motorcycle
452 -1WK72M4xeg_220.000_230.000.wav 220.000 230.000 Motorcycle
453 -1XfuJcdvfg_30.000_40.000.wav 30.000 40.000 Motorcycle
454 -3XWBAmjmaQ_11.000_21.000.wav 11.000 21.000 Motorcycle
455 -4-87UgJcUw_70.000_80.000.wav 70.000 80.000 Motorcycle
456 -4D3Gkyisyc_30.000_40.000.wav 30.000 40.000 Motorcycle
457 -5k5GyHd2So_4.000_14.000.wav 4.000 14.000 Motorcycle
458 -6A2L1U9b5Y_54.000_64.000.wav 54.000 64.000 Motorcycle
459 -6Yfati1N10_80.000_90.000.wav 80.000 90.000 Motorcycle
460 -7_o_GhpZpM_12.000_22.000.wav 12.000 22.000 Motorcycle
461 -7rZwMK6uSs_70.000_80.000.wav 70.000 80.000 Motorcycle
462 -85f5DKKfSo_30.000_40.000.wav 30.000 40.000 Motorcycle
463 -9Smdrt5zwk_40.000_50.000.wav 40.000 50.000 Motorcycle
464 -9gZLVDKpnE_30.000_40.000.wav 30.000 40.000 Motorcycle
465 -BGebo8V4XY_30.000_40.000.wav 30.000 40.000 Motorcycle
466 -DdiduB5B_w_190.000_200.000.wav 190.000 200.000 Motorcycle
467 -HIPq7T3eFI_11.000_21.000.wav 11.000 21.000 Motorcycle
468 -H_3oEkKe0M_50.000_60.000.wav 50.000 60.000 Motorcycle
469 -HmuMoykRqA_500.000_510.000.wav 500.000 510.000 Motorcycle
470 -IMRE_psvtI_30.000_40.000.wav 30.000 40.000 Motorcycle
471 -Ie4LSPDEF4_6.000_16.000.wav 6.000 16.000 Motorcycle
472 -J0F29UCZiA_70.000_80.000.wav 70.000 80.000 Motorcycle
473 -KFCJ7ydu2E_0.000_10.000.wav 0.000 10.000 Motorcycle
474 -KmDAgYb0Uo_100.000_110.000.wav 100.000 110.000 Motorcycle
475 -P7iW3WzNfc_400.000_410.000.wav 400.000 410.000 Motorcycle
476 -QMAKXzIGx4_10.000_20.000.wav 10.000 20.000 Motorcycle
477 -S-5z2vYtxw_10.000_20.000.wav 10.000 20.000 Motorcycle
478 -SlL0NZh51w_30.000_40.000.wav 30.000 40.000 Motorcycle
479 -US2mpJxbj4_30.000_40.000.wav 30.000 40.000 Motorcycle
480 -VO-C9C0uqY_1.000_11.000.wav 1.000 11.000 Motorcycle
481 --H_-CEB2wA_30.000_40.000.wav 30.000 40.000 Train
482 -1VsFy0eVJs_30.000_40.000.wav 30.000 40.000 Train
483 -1X7kpLnOpM_60.000_70.000.wav 60.000 70.000 Train
484 -3FIglJti0s_30.000_40.000.wav 30.000 40.000 Train
485 -5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train
486 -6KOEEiAf9s_19.000_29.000.wav 19.000 29.000 Train
487 -97l_c6PToE_30.000_40.000.wav 30.000 40.000 Train
488 -9S5Z-uciLo_70.000_80.000.wav 70.000 80.000 Train
489 -CkgGfKepO4_140.000_150.000.wav 140.000 150.000 Train
490 -E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train
491 -Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train
492 -JpQivta6MQ_20.000_30.000.wav 20.000 30.000 Train
493 -K9oTZj3mVQ_30.000_40.000.wav 30.000 40.000 Train
494 -KjE40DlSdU_0.000_10.000.wav 0.000 10.000 Train
495 -NrFtZ_xxFU_30.000_40.000.wav 30.000 40.000 Train
496 -PYRamK58Ss_0.000_10.000.wav 0.000 10.000 Train
497 -P_XDJt4p_s_30.000_40.000.wav 30.000 40.000 Train
498 -Pjylzex7oc_350.000_360.000.wav 350.000 360.000 Train
499 -QHuZGmIy_I_30.000_40.000.wav 30.000 40.000 Train
500 -Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train
501 -RXKRoRPWXg_30.000_40.000.wav 30.000 40.000 Train
502 -VH414svzI0_30.000_40.000.wav 30.000 40.000 Train
503 -WFdYxE-PYI_30.000_40.000.wav 30.000 40.000 Train
504 -Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train
505 -XcC-UlbcRA_30.000_40.000.wav 30.000 40.000 Train
506 -Y2cD8xvCHI_30.000_40.000.wav 30.000 40.000 Train
507 -ZKZkMHe3cY_70.000_80.000.wav 70.000 80.000 Train
508 -Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train
509 -aZ7XC4LG2A_30.000_40.000.wav 30.000 40.000 Train
510 -abVemAm9HM_430.000_440.000.wav 430.000 440.000 Train
511 1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Ambulance (siren)
512 7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Ambulance (siren)
513 -z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
514 00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
515 0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
516 3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Fire engine, fire truck (siren)
517 4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren)
518 35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Civil defense siren
519 06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Police car (siren)
520 0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Police car (siren)
521 0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Police car (siren)
522 17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Police car (siren)
523 4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Police car (siren)
524 -10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car
525 -122tCXtFhU_30.000_40.000.wav 30.000 40.000 Car
526 -14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car
527 -1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car
528 -1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car
529 -1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car
530 -1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car
531 -25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car
532 -2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Car
533 -2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car
534 -2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car
535 -2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car
536 -31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car
537 -35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car
538 -3929cmVE20_30.000_40.000.wav 30.000 40.000 Car
539 -3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car
540 -3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car
541 -3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car
542 -AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Car
543 -Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Car
544 -VULyMtKazE_0.000_7.000.wav 0.000 7.000 Car
545 -cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Car
546 06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Car
547 0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Car
548 0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car
549 4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car
550 5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car
551 7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car
552 9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car
553 9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car
554 -l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Car passing by
555 9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car passing by
556 -jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Bus
557 -45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Truck
558 -4B435WQvag_20.000_30.000.wav 20.000 30.000 Truck
559 -60XojQWWoc_30.000_40.000.wav 30.000 40.000 Truck
560 -6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Truck
561 -8OITuFZha8_30.000_40.000.wav 30.000 40.000 Truck
562 -8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Truck
563 -AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Truck
564 -AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Truck
565 -BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Truck
566 -Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Truck
567 -FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Truck
568 -Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Truck
569 -PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Truck
570 -X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Truck
571 -cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Truck
572 -oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Truck
573 -oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Truck
574 -qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Truck
575 -r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Truck
576 -s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Truck
577 -uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Truck
578 -x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Truck
579 -xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Truck
580 -zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Truck
581 0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Truck
582 0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Truck
583 0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Truck
584 0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Truck
585 0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Truck
586 3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Truck
587 -nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train
588 02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train
589 0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train
590 0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train
591 0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train
592 0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train
593 0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train
594 10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train
595 1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train
596 1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train
597 1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train
598 1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train
599 1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train
600 1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train
601 26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train
602 2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train
603 2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train
604 2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train
605 2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train
606 3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Train
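The listing above is a plain annotation table: a running index, a clip filename (YouTube ID with the excerpt's start and end seconds embedded in the name), the event onset and offset in seconds, and a sound-event label. Note that one clip can carry several labels (the same file appears under more than one siren class). A minimal parsing sketch, assuming tab-separated fields and that the leading index column is part of the file; the path and function names are hypothetical, not from the repository:

import csv
from collections import defaultdict

def load_annotations(path='strong_eval.tsv'):  # hypothetical path
    """Map each clip filename to its list of (onset, offset, label) events."""
    events = defaultdict(list)
    with open(path, newline='') as f:
        for row in csv.reader(f, delimiter='\t'):
            if not row:
                continue  # skip blank lines
            if len(row) == 5:
                row = row[1:]  # drop the leading running index, if present
            filename, onset, offset, label = row
            events[filename].append((float(onset), float(offset), label))
    return events

# e.g. events['3HxQ83IMyw4_70.000_80.000.wav'] == [(70.0, 80.0, 'Car alarm')]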

View File

@@ -0,0 +1,528 @@
index,mid,display_name
0,/m/09x0r,"Speech"
1,/m/05zppz,"Male speech, man speaking"
2,/m/02zsn,"Female speech, woman speaking"
3,/m/0ytgt,"Child speech, kid speaking"
4,/m/01h8n0,"Conversation"
5,/m/02qldy,"Narration, monologue"
6,/m/0261r1,"Babbling"
7,/m/0brhx,"Speech synthesizer"
8,/m/07p6fty,"Shout"
9,/m/07q4ntr,"Bellow"
10,/m/07rwj3x,"Whoop"
11,/m/07sr1lc,"Yell"
12,/m/04gy_2,"Battle cry"
13,/t/dd00135,"Children shouting"
14,/m/03qc9zr,"Screaming"
15,/m/02rtxlg,"Whispering"
16,/m/01j3sz,"Laughter"
17,/t/dd00001,"Baby laughter"
18,/m/07r660_,"Giggle"
19,/m/07s04w4,"Snicker"
20,/m/07sq110,"Belly laugh"
21,/m/07rgt08,"Chuckle, chortle"
22,/m/0463cq4,"Crying, sobbing"
23,/t/dd00002,"Baby cry, infant cry"
24,/m/07qz6j3,"Whimper"
25,/m/07qw_06,"Wail, moan"
26,/m/07plz5l,"Sigh"
27,/m/015lz1,"Singing"
28,/m/0l14jd,"Choir"
29,/m/01swy6,"Yodeling"
30,/m/02bk07,"Chant"
31,/m/01c194,"Mantra"
32,/t/dd00003,"Male singing"
33,/t/dd00004,"Female singing"
34,/t/dd00005,"Child singing"
35,/t/dd00006,"Synthetic singing"
36,/m/06bxc,"Rapping"
37,/m/02fxyj,"Humming"
38,/m/07s2xch,"Groan"
39,/m/07r4k75,"Grunt"
40,/m/01w250,"Whistling"
41,/m/0lyf6,"Breathing"
42,/m/07mzm6,"Wheeze"
43,/m/01d3sd,"Snoring"
44,/m/07s0dtb,"Gasp"
45,/m/07pyy8b,"Pant"
46,/m/07q0yl5,"Snort"
47,/m/01b_21,"Cough"
48,/m/0dl9sf8,"Throat clearing"
49,/m/01hsr_,"Sneeze"
50,/m/07ppn3j,"Sniff"
51,/m/06h7j,"Run"
52,/m/07qv_x_,"Shuffle"
53,/m/07pbtc8,"Walk, footsteps"
54,/m/03cczk,"Chewing, mastication"
55,/m/07pdhp0,"Biting"
56,/m/0939n_,"Gargling"
57,/m/01g90h,"Stomach rumble"
58,/m/03q5_w,"Burping, eructation"
59,/m/02p3nc,"Hiccup"
60,/m/02_nn,"Fart"
61,/m/0k65p,"Hands"
62,/m/025_jnm,"Finger snapping"
63,/m/0l15bq,"Clapping"
64,/m/01jg02,"Heart sounds, heartbeat"
65,/m/01jg1z,"Heart murmur"
66,/m/053hz1,"Cheering"
67,/m/028ght,"Applause"
68,/m/07rkbfh,"Chatter"
69,/m/03qtwd,"Crowd"
70,/m/07qfr4h,"Hubbub, speech noise, speech babble"
71,/t/dd00013,"Children playing"
72,/m/0jbk,"Animal"
73,/m/068hy,"Domestic animals, pets"
74,/m/0bt9lr,"Dog"
75,/m/05tny_,"Bark"
76,/m/07r_k2n,"Yip"
77,/m/07qf0zm,"Howl"
78,/m/07rc7d9,"Bow-wow"
79,/m/0ghcn6,"Growling"
80,/t/dd00136,"Whimper (dog)"
81,/m/01yrx,"Cat"
82,/m/02yds9,"Purr"
83,/m/07qrkrw,"Meow"
84,/m/07rjwbb,"Hiss"
85,/m/07r81j2,"Caterwaul"
86,/m/0ch8v,"Livestock, farm animals, working animals"
87,/m/03k3r,"Horse"
88,/m/07rv9rh,"Clip-clop"
89,/m/07q5rw0,"Neigh, whinny"
90,/m/01xq0k1,"Cattle, bovinae"
91,/m/07rpkh9,"Moo"
92,/m/0239kh,"Cowbell"
93,/m/068zj,"Pig"
94,/t/dd00018,"Oink"
95,/m/03fwl,"Goat"
96,/m/07q0h5t,"Bleat"
97,/m/07bgp,"Sheep"
98,/m/025rv6n,"Fowl"
99,/m/09b5t,"Chicken, rooster"
100,/m/07st89h,"Cluck"
101,/m/07qn5dc,"Crowing, cock-a-doodle-doo"
102,/m/01rd7k,"Turkey"
103,/m/07svc2k,"Gobble"
104,/m/09ddx,"Duck"
105,/m/07qdb04,"Quack"
106,/m/0dbvp,"Goose"
107,/m/07qwf61,"Honk"
108,/m/01280g,"Wild animals"
109,/m/0cdnk,"Roaring cats (lions, tigers)"
110,/m/04cvmfc,"Roar"
111,/m/015p6,"Bird"
112,/m/020bb7,"Bird vocalization, bird call, bird song"
113,/m/07pggtn,"Chirp, tweet"
114,/m/07sx8x_,"Squawk"
115,/m/0h0rv,"Pigeon, dove"
116,/m/07r_25d,"Coo"
117,/m/04s8yn,"Crow"
118,/m/07r5c2p,"Caw"
119,/m/09d5_,"Owl"
120,/m/07r_80w,"Hoot"
121,/m/05_wcq,"Bird flight, flapping wings"
122,/m/01z5f,"Canidae, dogs, wolves"
123,/m/06hps,"Rodents, rats, mice"
124,/m/04rmv,"Mouse"
125,/m/07r4gkf,"Patter"
126,/m/03vt0,"Insect"
127,/m/09xqv,"Cricket"
128,/m/09f96,"Mosquito"
129,/m/0h2mp,"Fly, housefly"
130,/m/07pjwq1,"Buzz"
131,/m/01h3n,"Bee, wasp, etc."
132,/m/09ld4,"Frog"
133,/m/07st88b,"Croak"
134,/m/078jl,"Snake"
135,/m/07qn4z3,"Rattle"
136,/m/032n05,"Whale vocalization"
137,/m/04rlf,"Music"
138,/m/04szw,"Musical instrument"
139,/m/0fx80y,"Plucked string instrument"
140,/m/0342h,"Guitar"
141,/m/02sgy,"Electric guitar"
142,/m/018vs,"Bass guitar"
143,/m/042v_gx,"Acoustic guitar"
144,/m/06w87,"Steel guitar, slide guitar"
145,/m/01glhc,"Tapping (guitar technique)"
146,/m/07s0s5r,"Strum"
147,/m/018j2,"Banjo"
148,/m/0jtg0,"Sitar"
149,/m/04rzd,"Mandolin"
150,/m/01bns_,"Zither"
151,/m/07xzm,"Ukulele"
152,/m/05148p4,"Keyboard (musical)"
153,/m/05r5c,"Piano"
154,/m/01s0ps,"Electric piano"
155,/m/013y1f,"Organ"
156,/m/03xq_f,"Electronic organ"
157,/m/03gvt,"Hammond organ"
158,/m/0l14qv,"Synthesizer"
159,/m/01v1d8,"Sampler"
160,/m/03q5t,"Harpsichord"
161,/m/0l14md,"Percussion"
162,/m/02hnl,"Drum kit"
163,/m/0cfdd,"Drum machine"
164,/m/026t6,"Drum"
165,/m/06rvn,"Snare drum"
166,/m/03t3fj,"Rimshot"
167,/m/02k_mr,"Drum roll"
168,/m/0bm02,"Bass drum"
169,/m/011k_j,"Timpani"
170,/m/01p970,"Tabla"
171,/m/01qbl,"Cymbal"
172,/m/03qtq,"Hi-hat"
173,/m/01sm1g,"Wood block"
174,/m/07brj,"Tambourine"
175,/m/05r5wn,"Rattle (instrument)"
176,/m/0xzly,"Maraca"
177,/m/0mbct,"Gong"
178,/m/016622,"Tubular bells"
179,/m/0j45pbj,"Mallet percussion"
180,/m/0dwsp,"Marimba, xylophone"
181,/m/0dwtp,"Glockenspiel"
182,/m/0dwt5,"Vibraphone"
183,/m/0l156b,"Steelpan"
184,/m/05pd6,"Orchestra"
185,/m/01kcd,"Brass instrument"
186,/m/0319l,"French horn"
187,/m/07gql,"Trumpet"
188,/m/07c6l,"Trombone"
189,/m/0l14_3,"Bowed string instrument"
190,/m/02qmj0d,"String section"
191,/m/07y_7,"Violin, fiddle"
192,/m/0d8_n,"Pizzicato"
193,/m/01xqw,"Cello"
194,/m/02fsn,"Double bass"
195,/m/085jw,"Wind instrument, woodwind instrument"
196,/m/0l14j_,"Flute"
197,/m/06ncr,"Saxophone"
198,/m/01wy6,"Clarinet"
199,/m/03m5k,"Harp"
200,/m/0395lw,"Bell"
201,/m/03w41f,"Church bell"
202,/m/027m70_,"Jingle bell"
203,/m/0gy1t2s,"Bicycle bell"
204,/m/07n_g,"Tuning fork"
205,/m/0f8s22,"Chime"
206,/m/026fgl,"Wind chime"
207,/m/0150b9,"Change ringing (campanology)"
208,/m/03qjg,"Harmonica"
209,/m/0mkg,"Accordion"
210,/m/0192l,"Bagpipes"
211,/m/02bxd,"Didgeridoo"
212,/m/0l14l2,"Shofar"
213,/m/07kc_,"Theremin"
214,/m/0l14t7,"Singing bowl"
215,/m/01hgjl,"Scratching (performance technique)"
216,/m/064t9,"Pop music"
217,/m/0glt670,"Hip hop music"
218,/m/02cz_7,"Beatboxing"
219,/m/06by7,"Rock music"
220,/m/03lty,"Heavy metal"
221,/m/05r6t,"Punk rock"
222,/m/0dls3,"Grunge"
223,/m/0dl5d,"Progressive rock"
224,/m/07sbbz2,"Rock and roll"
225,/m/05w3f,"Psychedelic rock"
226,/m/06j6l,"Rhythm and blues"
227,/m/0gywn,"Soul music"
228,/m/06cqb,"Reggae"
229,/m/01lyv,"Country"
230,/m/015y_n,"Swing music"
231,/m/0gg8l,"Bluegrass"
232,/m/02x8m,"Funk"
233,/m/02w4v,"Folk music"
234,/m/06j64v,"Middle Eastern music"
235,/m/03_d0,"Jazz"
236,/m/026z9,"Disco"
237,/m/0ggq0m,"Classical music"
238,/m/05lls,"Opera"
239,/m/02lkt,"Electronic music"
240,/m/03mb9,"House music"
241,/m/07gxw,"Techno"
242,/m/07s72n,"Dubstep"
243,/m/0283d,"Drum and bass"
244,/m/0m0jc,"Electronica"
245,/m/08cyft,"Electronic dance music"
246,/m/0fd3y,"Ambient music"
247,/m/07lnk,"Trance music"
248,/m/0g293,"Music of Latin America"
249,/m/0ln16,"Salsa music"
250,/m/0326g,"Flamenco"
251,/m/0155w,"Blues"
252,/m/05fw6t,"Music for children"
253,/m/02v2lh,"New-age music"
254,/m/0y4f8,"Vocal music"
255,/m/0z9c,"A capella"
256,/m/0164x2,"Music of Africa"
257,/m/0145m,"Afrobeat"
258,/m/02mscn,"Christian music"
259,/m/016cjb,"Gospel music"
260,/m/028sqc,"Music of Asia"
261,/m/015vgc,"Carnatic music"
262,/m/0dq0md,"Music of Bollywood"
263,/m/06rqw,"Ska"
264,/m/02p0sh1,"Traditional music"
265,/m/05rwpb,"Independent music"
266,/m/074ft,"Song"
267,/m/025td0t,"Background music"
268,/m/02cjck,"Theme music"
269,/m/03r5q_,"Jingle (music)"
270,/m/0l14gg,"Soundtrack music"
271,/m/07pkxdp,"Lullaby"
272,/m/01z7dr,"Video game music"
273,/m/0140xf,"Christmas music"
274,/m/0ggx5q,"Dance music"
275,/m/04wptg,"Wedding music"
276,/t/dd00031,"Happy music"
277,/t/dd00032,"Funny music"
278,/t/dd00033,"Sad music"
279,/t/dd00034,"Tender music"
280,/t/dd00035,"Exciting music"
281,/t/dd00036,"Angry music"
282,/t/dd00037,"Scary music"
283,/m/03m9d0z,"Wind"
284,/m/09t49,"Rustling leaves"
285,/t/dd00092,"Wind noise (microphone)"
286,/m/0jb2l,"Thunderstorm"
287,/m/0ngt1,"Thunder"
288,/m/0838f,"Water"
289,/m/06mb1,"Rain"
290,/m/07r10fb,"Raindrop"
291,/t/dd00038,"Rain on surface"
292,/m/0j6m2,"Stream"
293,/m/0j2kx,"Waterfall"
294,/m/05kq4,"Ocean"
295,/m/034srq,"Waves, surf"
296,/m/06wzb,"Steam"
297,/m/07swgks,"Gurgling"
298,/m/02_41,"Fire"
299,/m/07pzfmf,"Crackle"
300,/m/07yv9,"Vehicle"
301,/m/019jd,"Boat, Water vehicle"
302,/m/0hsrw,"Sailboat, sailing ship"
303,/m/056ks2,"Rowboat, canoe, kayak"
304,/m/02rlv9,"Motorboat, speedboat"
305,/m/06q74,"Ship"
306,/m/012f08,"Motor vehicle (road)"
307,/m/0k4j,"Car"
308,/m/0912c9,"Vehicle horn, car horn, honking"
309,/m/07qv_d5,"Toot"
310,/m/02mfyn,"Car alarm"
311,/m/04gxbd,"Power windows, electric windows"
312,/m/07rknqz,"Skidding"
313,/m/0h9mv,"Tire squeal"
314,/t/dd00134,"Car passing by"
315,/m/0ltv,"Race car, auto racing"
316,/m/07r04,"Truck"
317,/m/0gvgw0,"Air brake"
318,/m/05x_td,"Air horn, truck horn"
319,/m/02rhddq,"Reversing beeps"
320,/m/03cl9h,"Ice cream truck, ice cream van"
321,/m/01bjv,"Bus"
322,/m/03j1ly,"Emergency vehicle"
323,/m/04qvtq,"Police car (siren)"
324,/m/012n7d,"Ambulance (siren)"
325,/m/012ndj,"Fire engine, fire truck (siren)"
326,/m/04_sv,"Motorcycle"
327,/m/0btp2,"Traffic noise, roadway noise"
328,/m/06d_3,"Rail transport"
329,/m/07jdr,"Train"
330,/m/04zmvq,"Train whistle"
331,/m/0284vy3,"Train horn"
332,/m/01g50p,"Railroad car, train wagon"
333,/t/dd00048,"Train wheels squealing"
334,/m/0195fx,"Subway, metro, underground"
335,/m/0k5j,"Aircraft"
336,/m/014yck,"Aircraft engine"
337,/m/04229,"Jet engine"
338,/m/02l6bg,"Propeller, airscrew"
339,/m/09ct_,"Helicopter"
340,/m/0cmf2,"Fixed-wing aircraft, airplane"
341,/m/0199g,"Bicycle"
342,/m/06_fw,"Skateboard"
343,/m/02mk9,"Engine"
344,/t/dd00065,"Light engine (high frequency)"
345,/m/08j51y,"Dental drill, dentist's drill"
346,/m/01yg9g,"Lawn mower"
347,/m/01j4z9,"Chainsaw"
348,/t/dd00066,"Medium engine (mid frequency)"
349,/t/dd00067,"Heavy engine (low frequency)"
350,/m/01h82_,"Engine knocking"
351,/t/dd00130,"Engine starting"
352,/m/07pb8fc,"Idling"
353,/m/07q2z82,"Accelerating, revving, vroom"
354,/m/02dgv,"Door"
355,/m/03wwcy,"Doorbell"
356,/m/07r67yg,"Ding-dong"
357,/m/02y_763,"Sliding door"
358,/m/07rjzl8,"Slam"
359,/m/07r4wb8,"Knock"
360,/m/07qcpgn,"Tap"
361,/m/07q6cd_,"Squeak"
362,/m/0642b4,"Cupboard open or close"
363,/m/0fqfqc,"Drawer open or close"
364,/m/04brg2,"Dishes, pots, and pans"
365,/m/023pjk,"Cutlery, silverware"
366,/m/07pn_8q,"Chopping (food)"
367,/m/0dxrf,"Frying (food)"
368,/m/0fx9l,"Microwave oven"
369,/m/02pjr4,"Blender"
370,/m/02jz0l,"Water tap, faucet"
371,/m/0130jx,"Sink (filling or washing)"
372,/m/03dnzn,"Bathtub (filling or washing)"
373,/m/03wvsk,"Hair dryer"
374,/m/01jt3m,"Toilet flush"
375,/m/012xff,"Toothbrush"
376,/m/04fgwm,"Electric toothbrush"
377,/m/0d31p,"Vacuum cleaner"
378,/m/01s0vc,"Zipper (clothing)"
379,/m/03v3yw,"Keys jangling"
380,/m/0242l,"Coin (dropping)"
381,/m/01lsmm,"Scissors"
382,/m/02g901,"Electric shaver, electric razor"
383,/m/05rj2,"Shuffling cards"
384,/m/0316dw,"Typing"
385,/m/0c2wf,"Typewriter"
386,/m/01m2v,"Computer keyboard"
387,/m/081rb,"Writing"
388,/m/07pp_mv,"Alarm"
389,/m/07cx4,"Telephone"
390,/m/07pp8cl,"Telephone bell ringing"
391,/m/01hnzm,"Ringtone"
392,/m/02c8p,"Telephone dialing, DTMF"
393,/m/015jpf,"Dial tone"
394,/m/01z47d,"Busy signal"
395,/m/046dlr,"Alarm clock"
396,/m/03kmc9,"Siren"
397,/m/0dgbq,"Civil defense siren"
398,/m/030rvx,"Buzzer"
399,/m/01y3hg,"Smoke detector, smoke alarm"
400,/m/0c3f7m,"Fire alarm"
401,/m/04fq5q,"Foghorn"
402,/m/0l156k,"Whistle"
403,/m/06hck5,"Steam whistle"
404,/t/dd00077,"Mechanisms"
405,/m/02bm9n,"Ratchet, pawl"
406,/m/01x3z,"Clock"
407,/m/07qjznt,"Tick"
408,/m/07qjznl,"Tick-tock"
409,/m/0l7xg,"Gears"
410,/m/05zc1,"Pulleys"
411,/m/0llzx,"Sewing machine"
412,/m/02x984l,"Mechanical fan"
413,/m/025wky1,"Air conditioning"
414,/m/024dl,"Cash register"
415,/m/01m4t,"Printer"
416,/m/0dv5r,"Camera"
417,/m/07bjf,"Single-lens reflex camera"
418,/m/07k1x,"Tools"
419,/m/03l9g,"Hammer"
420,/m/03p19w,"Jackhammer"
421,/m/01b82r,"Sawing"
422,/m/02p01q,"Filing (rasp)"
423,/m/023vsd,"Sanding"
424,/m/0_ksk,"Power tool"
425,/m/01d380,"Drill"
426,/m/014zdl,"Explosion"
427,/m/032s66,"Gunshot, gunfire"
428,/m/04zjc,"Machine gun"
429,/m/02z32qm,"Fusillade"
430,/m/0_1c,"Artillery fire"
431,/m/073cg4,"Cap gun"
432,/m/0g6b5,"Fireworks"
433,/g/122z_qxw,"Firecracker"
434,/m/07qsvvw,"Burst, pop"
435,/m/07pxg6y,"Eruption"
436,/m/07qqyl4,"Boom"
437,/m/083vt,"Wood"
438,/m/07pczhz,"Chop"
439,/m/07pl1bw,"Splinter"
440,/m/07qs1cx,"Crack"
441,/m/039jq,"Glass"
442,/m/07q7njn,"Chink, clink"
443,/m/07rn7sz,"Shatter"
444,/m/04k94,"Liquid"
445,/m/07rrlb6,"Splash, splatter"
446,/m/07p6mqd,"Slosh"
447,/m/07qlwh6,"Squish"
448,/m/07r5v4s,"Drip"
449,/m/07prgkl,"Pour"
450,/m/07pqc89,"Trickle, dribble"
451,/t/dd00088,"Gush"
452,/m/07p7b8y,"Fill (with liquid)"
453,/m/07qlf79,"Spray"
454,/m/07ptzwd,"Pump (liquid)"
455,/m/07ptfmf,"Stir"
456,/m/0dv3j,"Boiling"
457,/m/0790c,"Sonar"
458,/m/0dl83,"Arrow"
459,/m/07rqsjt,"Whoosh, swoosh, swish"
460,/m/07qnq_y,"Thump, thud"
461,/m/07rrh0c,"Thunk"
462,/m/0b_fwt,"Electronic tuner"
463,/m/02rr_,"Effects unit"
464,/m/07m2kt,"Chorus effect"
465,/m/018w8,"Basketball bounce"
466,/m/07pws3f,"Bang"
467,/m/07ryjzk,"Slap, smack"
468,/m/07rdhzs,"Whack, thwack"
469,/m/07pjjrj,"Smash, crash"
470,/m/07pc8lb,"Breaking"
471,/m/07pqn27,"Bouncing"
472,/m/07rbp7_,"Whip"
473,/m/07pyf11,"Flap"
474,/m/07qb_dv,"Scratch"
475,/m/07qv4k0,"Scrape"
476,/m/07pdjhy,"Rub"
477,/m/07s8j8t,"Roll"
478,/m/07plct2,"Crushing"
479,/t/dd00112,"Crumpling, crinkling"
480,/m/07qcx4z,"Tearing"
481,/m/02fs_r,"Beep, bleep"
482,/m/07qwdck,"Ping"
483,/m/07phxs1,"Ding"
484,/m/07rv4dm,"Clang"
485,/m/07s02z0,"Squeal"
486,/m/07qh7jl,"Creak"
487,/m/07qwyj0,"Rustle"
488,/m/07s34ls,"Whir"
489,/m/07qmpdm,"Clatter"
490,/m/07p9k1k,"Sizzle"
491,/m/07qc9xj,"Clicking"
492,/m/07rwm0c,"Clickety-clack"
493,/m/07phhsh,"Rumble"
494,/m/07qyrcz,"Plop"
495,/m/07qfgpx,"Jingle, tinkle"
496,/m/07rcgpl,"Hum"
497,/m/07p78v5,"Zing"
498,/t/dd00121,"Boing"
499,/m/07s12q4,"Crunch"
500,/m/028v0c,"Silence"
501,/m/01v_m0,"Sine wave"
502,/m/0b9m1,"Harmonic"
503,/m/0hdsk,"Chirp tone"
504,/m/0c1dj,"Sound effect"
505,/m/07pt_g0,"Pulse"
506,/t/dd00125,"Inside, small room"
507,/t/dd00126,"Inside, large room or hall"
508,/t/dd00127,"Inside, public space"
509,/t/dd00128,"Outside, urban or manmade"
510,/t/dd00129,"Outside, rural or natural"
511,/m/01b9nn,"Reverberation"
512,/m/01jnbd,"Echo"
513,/m/096m7z,"Noise"
514,/m/06_y0by,"Environmental noise"
515,/m/07rgkc5,"Static"
516,/m/06xkwv,"Mains hum"
517,/m/0g12c5,"Distortion"
518,/m/08p9q4,"Sidetone"
519,/m/07szfh9,"Cacophony"
520,/m/0chx_,"White noise"
521,/m/0cj0r,"Pink noise"
522,/m/07p_0gm,"Throbbing"
523,/m/01jwx6,"Vibration"
524,/m/07c52,"Television"
525,/m/06bz3,"Radio"
526,/m/07hvw1,"Field recording"
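The 528-line table above is the standard AudioSet class map: one row per class with its index, machine identifier (mid), and quoted display_name, covering 527 classes numbered 0-526. A short sketch for loading it into an index-ordered label list, assuming only the header shown above; the file path is an assumption:

import csv

def load_labels(csv_path='class_labels_indices.csv'):  # hypothetical path
    """Return display names ordered so that labels[i] names output class i."""
    with open(csv_path, newline='') as f:
        rows = sorted(csv.DictReader(f), key=lambda r: int(r['index']))
    return [r['display_name'] for r in rows]

labels = load_labels()
assert len(labels) == 527 and labels[0] == 'Speech'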

View File

@@ -0,0 +1,42 @@
from sklearn import metrics
from pytorch_utils import forward
class Evaluator(object):
def __init__(self, model):
"""Evaluator.
Args:
model: object
"""
self.model = model
def evaluate(self, data_loader):
"""Forward evaluation data and calculate statistics.
Args:
data_loader: object
Returns:
statistics: dict,
{'average_precision': (classes_num,), 'auc': (classes_num,)}
"""
# Forward
output_dict = forward(
model=self.model,
generator=data_loader,
return_target=True)
clipwise_output = output_dict['clipwise_output'] # (audios_num, classes_num)
target = output_dict['target'] # (audios_num, classes_num)
average_precision = metrics.average_precision_score(
target, clipwise_output, average=None)
auc = metrics.roc_auc_score(target, clipwise_output, average=None)
statistics = {'average_precision': average_precision, 'auc': auc}
return statistics
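# Usage sketch (illustrative, not part of the original file): `eval_loader`
# is assumed to be a torch DataLoader built with this repo's EvaluateSampler
# and collate_fn, yielding dicts with 'waveform' and 'target' keys.
#
# evaluator = Evaluator(model=model)
# statistics = evaluator.evaluate(eval_loader)
# print('mAP: {:.3f}'.format(statistics['average_precision'].mean()))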

View File

@@ -0,0 +1,127 @@
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
import numpy as np
import argparse
import h5py
import math
import time
import logging
import matplotlib.pyplot as plt
import torch
torch.backends.cudnn.benchmark=True
torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from utilities import get_filename
from models import *
import config
class Transfer_Cnn14(nn.Module):
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
fmax, classes_num, freeze_base):
"""Classifier for a new task using pretrained Cnn14 as a sub module.
"""
super(Transfer_Cnn14, self).__init__()
audioset_classes_num = 527
self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin,
fmax, audioset_classes_num)
# Transfer to another task layer
self.fc_transfer = nn.Linear(2048, classes_num, bias=True)
if freeze_base:
# Freeze AudioSet pretrained layers
for param in self.base.parameters():
param.requires_grad = False
self.init_weights()
def init_weights(self):
init_layer(self.fc_transfer)
def load_from_pretrain(self, pretrained_checkpoint_path):
checkpoint = torch.load(pretrained_checkpoint_path)
self.base.load_state_dict(checkpoint['model'])
def forward(self, input, mixup_lambda=None):
"""Input: (batch_size, data_length)
"""
output_dict = self.base(input, mixup_lambda)
embedding = output_dict['embedding']
clipwise_output = torch.log_softmax(self.fc_transfer(embedding), dim=-1)
output_dict['clipwise_output'] = clipwise_output
return output_dict
def train(args):
# Arugments & parameters
sample_rate = args.sample_rate
window_size = args.window_size
hop_size = args.hop_size
mel_bins = args.mel_bins
fmin = args.fmin
fmax = args.fmax
model_type = args.model_type
pretrained_checkpoint_path = args.pretrained_checkpoint_path
freeze_base = args.freeze_base
device = 'cuda' if (args.cuda and torch.cuda.is_available()) else 'cpu'
classes_num = config.classes_num
pretrain = True if pretrained_checkpoint_path else False
# Model
Model = eval(model_type)
model = Model(sample_rate, window_size, hop_size, mel_bins, fmin, fmax,
classes_num, freeze_base)
# Load pretrained model
if pretrain:
logging.info('Load pretrained model from {}'.format(pretrained_checkpoint_path))
model.load_from_pretrain(pretrained_checkpoint_path)
# Parallel
print('GPU number: {}'.format(torch.cuda.device_count()))
model = torch.nn.DataParallel(model)
if 'cuda' in device:
model.to(device)
print('Load pretrained model successfully!')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Fine-tune a pretrained Cnn14 on a new task.')
subparsers = parser.add_subparsers(dest='mode')
# Train
parser_train = subparsers.add_parser('train')
parser_train.add_argument('--sample_rate', type=int, required=True)
parser_train.add_argument('--window_size', type=int, required=True)
parser_train.add_argument('--hop_size', type=int, required=True)
parser_train.add_argument('--mel_bins', type=int, required=True)
parser_train.add_argument('--fmin', type=int, required=True)
parser_train.add_argument('--fmax', type=int, required=True)
parser_train.add_argument('--model_type', type=str, required=True)
parser_train.add_argument('--pretrained_checkpoint_path', type=str)
parser_train.add_argument('--freeze_base', action='store_true', default=False)
parser_train.add_argument('--cuda', action='store_true', default=False)
# Parse arguments
args = parser.parse_args()
args.filename = get_filename(__file__)
if args.mode == 'train':
train(args)
else:
raise ValueError('Unknown mode: {}'.format(args.mode))
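# Usage sketch; the script name and checkpoint path are placeholders, and the
# feature settings mirror the defaults used elsewhere in this commit:
#
# python finetune_template.py train \
#     --sample_rate 32000 --window_size 1024 --hop_size 320 --mel_bins 64 \
#     --fmin 50 --fmax 14000 --model_type Transfer_Cnn14 \
#     --pretrained_checkpoint_path path/to/Cnn14_pretrained.pth \
#     --freeze_base --cuda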

View File

@@ -0,0 +1,206 @@
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
import numpy as np
import argparse
import librosa
import matplotlib.pyplot as plt
import torch
from utilities import create_folder, get_filename
from models import *
from pytorch_utils import move_data_to_device
import config
def audio_tagging(args):
"""Inference audio tagging result of an audio clip.
"""
# Arugments & parameters
sample_rate = args.sample_rate
window_size = args.window_size
hop_size = args.hop_size
mel_bins = args.mel_bins
fmin = args.fmin
fmax = args.fmax
model_type = args.model_type
checkpoint_path = args.checkpoint_path
audio_path = args.audio_path
device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
classes_num = config.classes_num
labels = config.labels
# Model
Model = eval(model_type)
model = Model(sample_rate=sample_rate, window_size=window_size,
hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
classes_num=classes_num)
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model'])
# Parallel
if 'cuda' in str(device):
model.to(device)
print('GPU number: {}'.format(torch.cuda.device_count()))
model = torch.nn.DataParallel(model)
else:
print('Using CPU.')
# Load audio
(waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
waveform = waveform[None, :] # (1, audio_length)
waveform = move_data_to_device(waveform, device)
# Forward
with torch.no_grad():
model.eval()
batch_output_dict = model(waveform, None)
clipwise_output = batch_output_dict['clipwise_output'].data.cpu().numpy()[0]
"""(classes_num,)"""
sorted_indexes = np.argsort(clipwise_output)[::-1]
# Print audio tagging top probabilities
for k in range(10):
print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]],
clipwise_output[sorted_indexes[k]]))
# Print embedding
if 'embedding' in batch_output_dict.keys():
embedding = batch_output_dict['embedding'].data.cpu().numpy()[0]
print('embedding: {}'.format(embedding.shape))
return clipwise_output, labels
def sound_event_detection(args):
"""Inference sound event detection result of an audio clip.
"""
# Arugments & parameters
sample_rate = args.sample_rate
window_size = args.window_size
hop_size = args.hop_size
mel_bins = args.mel_bins
fmin = args.fmin
fmax = args.fmax
model_type = args.model_type
checkpoint_path = args.checkpoint_path
audio_path = args.audio_path
device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
classes_num = config.classes_num
labels = config.labels
frames_per_second = sample_rate // hop_size
# Paths
fig_path = os.path.join('results', '{}.png'.format(get_filename(audio_path)))
create_folder(os.path.dirname(fig_path))
# Model
Model = eval(model_type)
model = Model(sample_rate=sample_rate, window_size=window_size,
hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
classes_num=classes_num)
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model'])
# Parallel
print('GPU number: {}'.format(torch.cuda.device_count()))
model = torch.nn.DataParallel(model)
if 'cuda' in str(device):
model.to(device)
# Load audio
(waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
waveform = waveform[None, :] # (1, audio_length)
waveform = move_data_to_device(waveform, device)
# Forward
with torch.no_grad():
model.eval()
batch_output_dict = model(waveform, None)
framewise_output = batch_output_dict['framewise_output'].data.cpu().numpy()[0]
"""(time_steps, classes_num)"""
print('Sound event detection result (time_steps x classes_num): {}'.format(
framewise_output.shape))
sorted_indexes = np.argsort(np.max(framewise_output, axis=0))[::-1]
top_k = 10 # Show top results
top_result_mat = framewise_output[:, sorted_indexes[0 : top_k]]
"""(time_steps, top_k)"""
# Plot result
stft = librosa.core.stft(y=waveform[0].data.cpu().numpy(), n_fft=window_size,
hop_length=hop_size, window='hann', center=True)
frames_num = stft.shape[-1]
fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 4))
axs[0].matshow(np.log(np.abs(stft)), origin='lower', aspect='auto', cmap='jet')
axs[0].set_ylabel('Frequency bins')
axs[0].set_title('Log spectrogram')
axs[1].matshow(top_result_mat.T, origin='upper', aspect='auto', cmap='jet', vmin=0, vmax=1)
axs[1].xaxis.set_ticks(np.arange(0, frames_num, frames_per_second))
axs[1].xaxis.set_ticklabels(np.arange(0, frames_num / frames_per_second))
axs[1].yaxis.set_ticks(np.arange(0, top_k))
axs[1].yaxis.set_ticklabels(np.array(labels)[sorted_indexes[0 : top_k]])
axs[1].yaxis.grid(color='k', linestyle='solid', linewidth=0.3, alpha=0.3)
axs[1].set_xlabel('Seconds')
axs[1].xaxis.set_ticks_position('bottom')
plt.tight_layout()
plt.savefig(fig_path)
print('Save sound event detection visualization to {}'.format(fig_path))
return framewise_output, labels
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Audio tagging and sound event detection inference.')
subparsers = parser.add_subparsers(dest='mode')
parser_at = subparsers.add_parser('audio_tagging')
parser_at.add_argument('--sample_rate', type=int, default=32000)
parser_at.add_argument('--window_size', type=int, default=1024)
parser_at.add_argument('--hop_size', type=int, default=320)
parser_at.add_argument('--mel_bins', type=int, default=64)
parser_at.add_argument('--fmin', type=int, default=50)
parser_at.add_argument('--fmax', type=int, default=14000)
parser_at.add_argument('--model_type', type=str, required=True)
parser_at.add_argument('--checkpoint_path', type=str, required=True)
parser_at.add_argument('--audio_path', type=str, required=True)
parser_at.add_argument('--cuda', action='store_true', default=False)
parser_sed = subparsers.add_parser('sound_event_detection')
parser_sed.add_argument('--sample_rate', type=int, default=32000)
parser_sed.add_argument('--window_size', type=int, default=1024)
parser_sed.add_argument('--hop_size', type=int, default=320)
parser_sed.add_argument('--mel_bins', type=int, default=64)
parser_sed.add_argument('--fmin', type=int, default=50)
parser_sed.add_argument('--fmax', type=int, default=14000)
parser_sed.add_argument('--model_type', type=str, required=True)
parser_sed.add_argument('--checkpoint_path', type=str, required=True)
parser_sed.add_argument('--audio_path', type=str, required=True)
parser_sed.add_argument('--cuda', action='store_true', default=False)
args = parser.parse_args()
if args.mode == 'audio_tagging':
audio_tagging(args)
elif args.mode == 'sound_event_detection':
sound_event_detection(args)
else:
raise ValueError('Unknown mode: {}'.format(args.mode))
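# Usage sketch; script name and paths are assumptions, the hyperparameter
# defaults match the SoundDetection wrapper in this commit:
#
# python inference.py sound_event_detection \
#     --model_type PVT --checkpoint_path path/to/audio_detection.pth \
#     --audio_path path/to/example.wav --cuda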

View File

@@ -0,0 +1,14 @@
import torch
import torch.nn.functional as F
def clip_bce(output_dict, target_dict):
"""Binary crossentropy loss.
"""
return F.binary_cross_entropy(
output_dict['clipwise_output'], target_dict['target'])
def get_loss_func(loss_type):
if loss_type == 'clip_bce':
return clip_bce
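# Minimal sketch of clip_bce on dummy tensors; note the models in this commit
# already apply a sigmoid, so 'clipwise_output' holds probabilities, not logits.
#
# output_dict = {'clipwise_output': torch.rand(4, 527)}
# target_dict = {'target': torch.randint(0, 2, (4, 527)).float()}
# loss = get_loss_func('clip_bce')(output_dict, target_dict)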

View File

@@ -0,0 +1,378 @@
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '../utils'))
import numpy as np
import argparse
import time
import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from utilities import (create_folder, get_filename, create_logging, Mixup,
StatisticsContainer)
from models import (PVT, PVT2, PVT_lr, PVT_nopretrain, PVT_2layer, Cnn14, Cnn14_no_specaug, Cnn14_no_dropout,
Cnn6, Cnn10, ResNet22, ResNet38, ResNet54, Cnn14_emb512, Cnn14_emb128,
Cnn14_emb32, MobileNetV1, MobileNetV2, LeeNet11, LeeNet24, DaiNet19,
Res1dNet31, Res1dNet51, Wavegram_Cnn14, Wavegram_Logmel_Cnn14,
Wavegram_Logmel128_Cnn14, Cnn14_16k, Cnn14_8k, Cnn14_mel32, Cnn14_mel128,
Cnn14_mixup_time_domain, Cnn14_DecisionLevelMax, Cnn14_DecisionLevelAtt, Cnn6_Transformer, GLAM, GLAM2, GLAM3, Cnn4, EAT)
#from models_test import (PVT_test)
#from models1 import (PVT1)
#from models_vig import (VIG, VIG2)
#from models_vvt import (VVT)
#from models2 import (MPVIT, MPVIT2)
#from models_reshape import (PVT_reshape, PVT_tscam)
#from models_swin import (Swin, Swin_nopretrain)
#from models_swin2 import (Swin2)
#from models_van import (Van, Van_tiny)
#from models_focal import (Focal)
#from models_cross import (Cross)
#from models_cov import (Cov)
#from models_cnn import (Cnn_light)
#from models_twins import (Twins)
#from models_cmt import (Cmt, Cmt1)
#from models_shunted import (Shunted)
#from models_quadtree import (Quadtree, Quadtree2, Quadtree_nopretrain)
#from models_davit import (Davit_tscam, Davit, Davit_nopretrain)
from pytorch_utils import (move_data_to_device, count_parameters, count_flops,
do_mixup)
from data_generator import (AudioSetDataset, TrainSampler, BalancedTrainSampler,
AlternateTrainSampler, EvaluateSampler, collate_fn)
from evaluate import Evaluator
import config
from losses import get_loss_func
def train(args):
"""Train AudioSet tagging model.
Args:
dataset_dir: str
workspace: str
data_type: 'balanced_train' | 'full_train'
window_size: int
hop_size: int
mel_bins: int
model_type: str
loss_type: 'clip_bce'
balanced: 'none' | 'balanced' | 'alternate'
augmentation: 'none' | 'mixup'
batch_size: int
learning_rate: float
resume_iteration: int
early_stop: int
accumulation_steps: int
cuda: bool
"""
# Arugments & parameters
workspace = args.workspace
data_type = args.data_type
sample_rate = args.sample_rate
window_size = args.window_size
hop_size = args.hop_size
mel_bins = args.mel_bins
fmin = args.fmin
fmax = args.fmax
model_type = args.model_type
loss_type = args.loss_type
balanced = args.balanced
augmentation = args.augmentation
batch_size = args.batch_size
learning_rate = args.learning_rate
resume_iteration = args.resume_iteration
early_stop = args.early_stop
device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu')
filename = args.filename
num_workers = 8
clip_samples = config.clip_samples
classes_num = config.classes_num
loss_func = get_loss_func(loss_type)
# Paths
black_list_csv = None
train_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes',
'{}.h5'.format(data_type))
eval_bal_indexes_hdf5_path = os.path.join(workspace,
'hdf5s', 'indexes', 'balanced_train.h5')
eval_test_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes',
'eval.h5')
checkpoints_dir = os.path.join(workspace, 'checkpoints', filename,
'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
'data_type={}'.format(data_type), model_type,
'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size))
create_folder(checkpoints_dir)
statistics_path = os.path.join(workspace, 'statistics', filename,
'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
'data_type={}'.format(data_type), model_type,
'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
'statistics.pkl')
create_folder(os.path.dirname(statistics_path))
logs_dir = os.path.join(workspace, 'logs', filename,
'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
'data_type={}'.format(data_type), model_type,
'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size))
create_logging(logs_dir, filemode='w')
logging.info(args)
if 'cuda' in str(device):
logging.info('Using GPU.')
device = 'cuda'
else:
logging.info('Using CPU. Set --cuda flag to use GPU.')
device = 'cpu'
# Model
Model = eval(model_type)
model = Model(sample_rate=sample_rate, window_size=window_size,
hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax,
classes_num=classes_num)
total = sum(p.numel() for p in model.parameters())
print("Total params: %.2fM" % (total/1e6))
logging.info("Total params: %.2fM" % (total/1e6))
#params_num = count_parameters(model)
# flops_num = count_flops(model, clip_samples)
#logging.info('Parameters num: {}'.format(params_num))
# logging.info('Flops num: {:.3f} G'.format(flops_num / 1e9))
# Dataset will be used by DataLoader later. Dataset takes a meta as input
# and returns a waveform and a target.
dataset = AudioSetDataset(sample_rate=sample_rate)
# Train sampler
if balanced == 'none':
Sampler = TrainSampler
elif balanced == 'balanced':
Sampler = BalancedTrainSampler
elif balanced == 'alternate':
Sampler = AlternateTrainSampler
train_sampler = Sampler(
indexes_hdf5_path=train_indexes_hdf5_path,
batch_size=batch_size * 2 if 'mixup' in augmentation else batch_size,
black_list_csv=black_list_csv)
# Evaluate sampler
eval_bal_sampler = EvaluateSampler(
indexes_hdf5_path=eval_bal_indexes_hdf5_path, batch_size=batch_size)
eval_test_sampler = EvaluateSampler(
indexes_hdf5_path=eval_test_indexes_hdf5_path, batch_size=batch_size)
# Data loader
train_loader = torch.utils.data.DataLoader(dataset=dataset,
batch_sampler=train_sampler, collate_fn=collate_fn,
num_workers=num_workers, pin_memory=True)
eval_bal_loader = torch.utils.data.DataLoader(dataset=dataset,
batch_sampler=eval_bal_sampler, collate_fn=collate_fn,
num_workers=num_workers, pin_memory=True)
eval_test_loader = torch.utils.data.DataLoader(dataset=dataset,
batch_sampler=eval_test_sampler, collate_fn=collate_fn,
num_workers=num_workers, pin_memory=True)
mix=0.5
if 'mixup' in augmentation:
mixup_augmenter = Mixup(mixup_alpha=mix)
print(mix)
logging.info(mix)
# Evaluator
evaluator = Evaluator(model=model)
# Statistics
statistics_container = StatisticsContainer(statistics_path)
# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.05, amsgrad=True)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4, min_lr=1e-06, verbose=True)
train_bgn_time = time.time()
# Resume training
if resume_iteration > 0:
resume_checkpoint_path = os.path.join(workspace, 'checkpoints', filename,
'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format(
sample_rate, window_size, hop_size, mel_bins, fmin, fmax),
'data_type={}'.format(data_type), model_type,
'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced),
'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size),
'{}_iterations.pth'.format(resume_iteration))
logging.info('Loading checkpoint {}'.format(resume_checkpoint_path))
checkpoint = torch.load(resume_checkpoint_path)
model.load_state_dict(checkpoint['model'])
train_sampler.load_state_dict(checkpoint['sampler'])
statistics_container.load_state_dict(resume_iteration)
iteration = checkpoint['iteration']
else:
iteration = 0
# Parallel
print('GPU number: {}'.format(torch.cuda.device_count()))
model = torch.nn.DataParallel(model)
if 'cuda' in str(device):
model.to(device)
if resume_iteration:
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler.load_state_dict(checkpoint['scheduler'])
print(optimizer.state_dict()['param_groups'][0]['lr'])
time1 = time.time()
for batch_data_dict in train_loader:
"""batch_data_dict: {
'audio_name': (batch_size [*2 if mixup],),
'waveform': (batch_size [*2 if mixup], clip_samples),
'target': (batch_size [*2 if mixup], classes_num),
(if mixup) 'mixup_lambda': (batch_size * 2,)}
"""
# Evaluate
if (iteration % 2000 == 0 and iteration >= resume_iteration) or (iteration == 0):
train_fin_time = time.time()
bal_statistics = evaluator.evaluate(eval_bal_loader)
test_statistics = evaluator.evaluate(eval_test_loader)
logging.info('Validate bal mAP: {:.3f}'.format(
np.mean(bal_statistics['average_precision'])))
logging.info('Validate test mAP: {:.3f}'.format(
np.mean(test_statistics['average_precision'])))
statistics_container.append(iteration, bal_statistics, data_type='bal')
statistics_container.append(iteration, test_statistics, data_type='test')
statistics_container.dump()
train_time = train_fin_time - train_bgn_time
validate_time = time.time() - train_fin_time
logging.info(
'iteration: {}, train time: {:.3f} s, validate time: {:.3f} s'
''.format(iteration, train_time, validate_time))
logging.info('------------------------------------')
train_bgn_time = time.time()
# Save model
if iteration % 2000 == 0:
checkpoint = {
'iteration': iteration,
'model': model.module.state_dict(),
'sampler': train_sampler.state_dict(),
'optimizer': optimizer.state_dict(),
'scheduler': scheduler.state_dict()}
checkpoint_path = os.path.join(
checkpoints_dir, '{}_iterations.pth'.format(iteration))
torch.save(checkpoint, checkpoint_path)
logging.info('Model saved to {}'.format(checkpoint_path))
# Mixup lambda
if 'mixup' in augmentation:
batch_data_dict['mixup_lambda'] = mixup_augmenter.get_lambda(
batch_size=len(batch_data_dict['waveform']))
# Move data to device
for key in batch_data_dict.keys():
batch_data_dict[key] = move_data_to_device(batch_data_dict[key], device)
# Forward
model.train()
if 'mixup' in augmentation:
batch_output_dict = model(batch_data_dict['waveform'],
batch_data_dict['mixup_lambda'])
"""{'clipwise_output': (batch_size, classes_num), ...}"""
batch_target_dict = {'target': do_mixup(batch_data_dict['target'],
batch_data_dict['mixup_lambda'])}
"""{'target': (batch_size, classes_num)}"""
else:
batch_output_dict = model(batch_data_dict['waveform'], None)
"""{'clipwise_output': (batch_size, classes_num), ...}"""
batch_target_dict = {'target': batch_data_dict['target']}
"""{'target': (batch_size, classes_num)}"""
# Loss
loss = loss_func(batch_output_dict, batch_target_dict)
# Backward
loss.backward()
optimizer.step()
optimizer.zero_grad()
if iteration % 10 == 0:
print(iteration, loss)
#print('--- Iteration: {}, train time: {:.3f} s / 10 iterations ---'\
# .format(iteration, time.time() - time1))
#time1 = time.time()
if iteration % 2000 == 0:
scheduler.step(np.mean(test_statistics['average_precision']))
print(optimizer.state_dict()['param_groups'][0]['lr'])
logging.info(optimizer.state_dict()['param_groups'][0]['lr'])
# Stop learning
if iteration == early_stop:
break
iteration += 1
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Train an AudioSet tagging model.')
subparsers = parser.add_subparsers(dest='mode')
parser_train = subparsers.add_parser('train')
parser_train.add_argument('--workspace', type=str, required=True)
parser_train.add_argument('--data_type', type=str, default='full_train', choices=['balanced_train', 'full_train'])
parser_train.add_argument('--sample_rate', type=int, default=32000)
parser_train.add_argument('--window_size', type=int, default=1024)
parser_train.add_argument('--hop_size', type=int, default=320)
parser_train.add_argument('--mel_bins', type=int, default=64)
parser_train.add_argument('--fmin', type=int, default=50)
parser_train.add_argument('--fmax', type=int, default=14000)
parser_train.add_argument('--model_type', type=str, required=True)
parser_train.add_argument('--loss_type', type=str, default='clip_bce', choices=['clip_bce'])
parser_train.add_argument('--balanced', type=str, default='balanced', choices=['none', 'balanced', 'alternate'])
parser_train.add_argument('--augmentation', type=str, default='mixup', choices=['none', 'mixup'])
parser_train.add_argument('--batch_size', type=int, default=32)
parser_train.add_argument('--learning_rate', type=float, default=1e-3)
parser_train.add_argument('--resume_iteration', type=int, default=0)
parser_train.add_argument('--early_stop', type=int, default=1000000)
parser_train.add_argument('--cuda', action='store_true', default=False)
args = parser.parse_args()
args.filename = get_filename(__file__)
if args.mode == 'train':
train(args)
else:
raise ValueError('Unknown mode: {}'.format(args.mode))
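# Usage sketch (workspace path is a placeholder):
#
# python main.py train --workspace workspaces/audioset --data_type full_train \
#     --sample_rate 32000 --window_size 1024 --hop_size 320 --mel_bins 64 \
#     --fmin 50 --fmax 14000 --model_type PVT --loss_type clip_bce \
#     --balanced balanced --augmentation mixup --batch_size 32 \
#     --learning_rate 1e-3 --cuda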

View File

@@ -0,0 +1,951 @@
import os
import sys
import io
import re
import copy
import math
import random
import warnings
from collections import OrderedDict
from copy import deepcopy
from functools import partial
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from torch import einsum
from torch.cuda.amp import autocast
from torch.nn.parameter import Parameter
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation
from audio_infer.pytorch.pytorch_utils import do_mixup, interpolate, pad_framewise_output
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.models.helpers import load_pretrained
#from mmdet.models.builder import BACKBONES
from mmdet.utils import get_root_logger
from mmcv.runner import _load_checkpoint, load_state_dict
import mmcv.runner
from einops import rearrange
from einops.layers.torch import Rearrange, Reduce
os.environ['TORCH_HOME'] = '../pretrained_models'
# Note: load_checkpoint is defined below, overriding mmcv.runner.load_checkpoint.
def load_checkpoint(model,
filename,
map_location=None,
strict=False,
logger=None,
revise_keys=[(r'^module\.', '')]):
"""Load checkpoint from a file or URI.
Args:
model (Module): Module to load checkpoint.
filename (str): Accept local filepath, URL, ``torchvision://xxx``,
``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
details.
map_location (str): Same as :func:`torch.load`.
strict (bool): Whether to allow different params for the model and
checkpoint.
logger (:mod:`logging.Logger` or None): The logger for error message.
revise_keys (list): A list of customized keywords to modify the
state_dict in checkpoint. Each item is a (pattern, replacement)
pair of the regular expression operations. Default: strip
the prefix 'module.' by [(r'^module\\.', '')].
Returns:
dict or OrderedDict: The loaded checkpoint.
"""
checkpoint = _load_checkpoint(filename, map_location, logger)
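# The ImageNet-pretrained patch embedding expects 3-channel RGB input; sum its
# weights over the colour dimension so the model accepts 1-channel spectrograms.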
new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1))
checkpoint['patch_embed1.proj.weight'] = new_proj.weight
# OrderedDict is a subclass of dict
if not isinstance(checkpoint, dict):
raise RuntimeError(
f'No state_dict found in checkpoint file {filename}')
# get state_dict from checkpoint
if 'state_dict' in checkpoint:
state_dict = checkpoint['state_dict']
else:
state_dict = checkpoint
# strip prefix of state_dict
metadata = getattr(state_dict, '_metadata', OrderedDict())
for p, r in revise_keys:
state_dict = OrderedDict(
{re.sub(p, r, k): v
for k, v in state_dict.items()})
state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()})
# Keep metadata in state_dict
state_dict._metadata = metadata
# load state_dict
load_state_dict(model, state_dict, strict, logger)
return checkpoint
def init_layer(layer):
"""Initialize a Linear or Convolutional layer. """
nn.init.xavier_uniform_(layer.weight)
if hasattr(layer, 'bias'):
if layer.bias is not None:
layer.bias.data.fill_(0.)
def init_bn(bn):
"""Initialize a Batchnorm layer. """
bn.bias.data.fill_(0.)
bn.weight.data.fill_(1.)
class TimeShift(nn.Module):
def __init__(self, mean, std):
super().__init__()
self.mean = mean
self.std = std
def forward(self, x):
if self.training:
shift = torch.empty(1).normal_(self.mean, self.std).int().item()
x = torch.roll(x, shift, dims=2)
return x
class LinearSoftPool(nn.Module):
"""LinearSoftPool
Linear softmax pooling: takes framewise probabilities and returns a clip-level probability close to the actual maximum value.
Taken from the paper:
A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling
https://arxiv.org/abs/1810.09050
"""
def __init__(self, pooldim=1):
super().__init__()
self.pooldim = pooldim
def forward(self, logits, time_decision):
return (time_decision**2).sum(self.pooldim) / time_decision.sum(
self.pooldim)
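# e.g. for per-frame probabilities p_t this computes sum(p_t^2) / sum(p_t),
# a smooth pooling that sits between mean and max over time.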
class PVT(nn.Module):
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
fmax, classes_num):
super(PVT, self).__init__()
window = 'hann'
center = True
pad_mode = 'reflect'
ref = 1.0
amin = 1e-10
top_db = None
# Spectrogram extractor
self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
win_length=window_size, window=window, center=center, pad_mode=pad_mode,
freeze_parameters=True)
# Logmel feature extractor
self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
freeze_parameters=True)
self.time_shift = TimeShift(0, 10)
# Spec augmenter
self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
freq_drop_width=8, freq_stripes_num=2)
self.bn0 = nn.BatchNorm2d(64)
self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
fdim=64,
patch_size=7,
stride=4,
in_chans=1,
num_classes=classes_num,
embed_dims=[64, 128, 320, 512],
depths=[3, 4, 6, 3],
num_heads=[1, 2, 5, 8],
mlp_ratios=[8, 8, 4, 4],
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
drop_path_rate=0.1,
sr_ratios=[8, 4, 2, 1],
norm_layer=partial(nn.LayerNorm, eps=1e-6),
num_stages=4,
#pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
)
#self.temp_pool = LinearSoftPool()
self.avgpool = nn.AdaptiveAvgPool1d(1)
self.fc_audioset = nn.Linear(512, classes_num, bias=True)
self.init_weights()
def init_weights(self):
init_bn(self.bn0)
init_layer(self.fc_audioset)
def forward(self, input, mixup_lambda=None):
"""Input: (batch_size, times_steps, freq_bins)"""
interpolate_ratio = 32
x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
frames_num = x.shape[2]
x = x.transpose(1, 3)
x = self.bn0(x)
x = x.transpose(1, 3)
if self.training:
x = self.time_shift(x)
x = self.spec_augmenter(x)
# Mixup on spectrogram
if self.training and mixup_lambda is not None:
x = do_mixup(x, mixup_lambda)
#print(x.shape) #torch.Size([10, 1, 1001, 64])
x = self.pvt_transformer(x)
#print(x.shape) #torch.Size([10, 800, 128])
x = torch.mean(x, dim=3)
x = x.transpose(1, 2).contiguous()
framewise_output = torch.sigmoid(self.fc_audioset(x))
#clipwise_output = torch.mean(framewise_output, dim=1)
#clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
x = framewise_output.transpose(1, 2).contiguous()
x = self.avgpool(x)
clipwise_output = torch.flatten(x, 1)
#print(framewise_output.shape) #torch.Size([10, 100, 17])
framewise_output = interpolate(framewise_output, interpolate_ratio)
#framewise_output = framewise_output[:,:1000,:]
#framewise_output = pad_framewise_output(framewise_output, frames_num)
output_dict = {'framewise_output': framewise_output,
'clipwise_output': clipwise_output}
return output_dict
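# Shape sketch (assumed values): a 10 s clip at 32 kHz with hop_size=320
# gives ~1001 STFT frames; framewise_output is upsampled back by the 32x
# interpolate_ratio.
#
# model = PVT(32000, 1024, 320, 64, 50, 14000, classes_num=527)
# out = model(torch.randn(2, 320000))   # raw waveform batch
# out['clipwise_output'].shape          # (2, 527)
# out['framewise_output'].shape         # (2, time_steps, 527)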
class PVT2(nn.Module):
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
fmax, classes_num):
super(PVT2, self).__init__()
window = 'hann'
center = True
pad_mode = 'reflect'
ref = 1.0
amin = 1e-10
top_db = None
# Spectrogram extractor
self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
win_length=window_size, window=window, center=center, pad_mode=pad_mode,
freeze_parameters=True)
# Logmel feature extractor
self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
freeze_parameters=True)
self.time_shift = TimeShift(0, 10)
# Spec augmenter
self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
freq_drop_width=8, freq_stripes_num=2)
self.bn0 = nn.BatchNorm2d(64)
self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
fdim=64,
patch_size=7,
stride=4,
in_chans=1,
num_classes=classes_num,
embed_dims=[64, 128, 320, 512],
depths=[3, 4, 6, 3],
num_heads=[1, 2, 5, 8],
mlp_ratios=[8, 8, 4, 4],
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
drop_path_rate=0.1,
sr_ratios=[8, 4, 2, 1],
norm_layer=partial(nn.LayerNorm, eps=1e-6),
num_stages=4,
pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
)
#self.temp_pool = LinearSoftPool()
self.fc_audioset = nn.Linear(512, classes_num, bias=True)
self.init_weights()
def init_weights(self):
init_bn(self.bn0)
init_layer(self.fc_audioset)
def forward(self, input, mixup_lambda=None):
"""Input: (batch_size, times_steps, freq_bins)"""
interpolate_ratio = 32
x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
frames_num = x.shape[2]
x = x.transpose(1, 3)
x = self.bn0(x)
x = x.transpose(1, 3)
if self.training:
#x = self.time_shift(x)
x = self.spec_augmenter(x)
# Mixup on spectrogram
if self.training and mixup_lambda is not None:
x = do_mixup(x, mixup_lambda)
#print(x.shape) #torch.Size([10, 1, 1001, 64])
x = self.pvt_transformer(x)
#print(x.shape) #torch.Size([10, 800, 128])
x = torch.mean(x, dim=3)
x = x.transpose(1, 2).contiguous()
framewise_output = torch.sigmoid(self.fc_audioset(x))
clipwise_output = torch.mean(framewise_output, dim=1)
#clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
#print(framewise_output.shape) #torch.Size([10, 100, 17])
framewise_output = interpolate(framewise_output, interpolate_ratio)
#framewise_output = framewise_output[:,:1000,:]
#framewise_output = pad_framewise_output(framewise_output, frames_num)
output_dict = {'framewise_output': framewise_output,
'clipwise_output': clipwise_output}
return output_dict
class PVT_2layer(nn.Module):
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
fmax, classes_num):
super(PVT_2layer, self).__init__()
window = 'hann'
center = True
pad_mode = 'reflect'
ref = 1.0
amin = 1e-10
top_db = None
# Spectrogram extractor
self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
win_length=window_size, window=window, center=center, pad_mode=pad_mode,
freeze_parameters=True)
# Logmel feature extractor
self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
freeze_parameters=True)
self.time_shift = TimeShift(0, 10)
# Spec augmenter
self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
freq_drop_width=8, freq_stripes_num=2)
self.bn0 = nn.BatchNorm2d(64)
self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
fdim=64,
patch_size=7,
stride=4,
in_chans=1,
num_classes=classes_num,
embed_dims=[64, 128],
depths=[3, 4],
num_heads=[1, 2],
mlp_ratios=[8, 8],
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
drop_path_rate=0.1,
sr_ratios=[8, 4],
norm_layer=partial(nn.LayerNorm, eps=1e-6),
num_stages=2,
pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
)
#self.temp_pool = LinearSoftPool()
self.avgpool = nn.AdaptiveAvgPool1d(1)
self.fc_audioset = nn.Linear(128, classes_num, bias=True)
self.init_weights()
def init_weights(self):
init_bn(self.bn0)
init_layer(self.fc_audioset)
def forward(self, input, mixup_lambda=None):
"""Input: (batch_size, times_steps, freq_bins)"""
interpolate_ratio = 8
x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
frames_num = x.shape[2]
x = x.transpose(1, 3)
x = self.bn0(x)
x = x.transpose(1, 3)
if self.training:
x = self.time_shift(x)
x = self.spec_augmenter(x)
# Mixup on spectrogram
if self.training and mixup_lambda is not None:
x = do_mixup(x, mixup_lambda)
#print(x.shape) #torch.Size([10, 1, 1001, 64])
x = self.pvt_transformer(x)
#print(x.shape) #torch.Size([10, 800, 128])
x = torch.mean(x, dim=3)
x = x.transpose(1, 2).contiguous()
framewise_output = torch.sigmoid(self.fc_audioset(x))
#clipwise_output = torch.mean(framewise_output, dim=1)
#clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
x = framewise_output.transpose(1, 2).contiguous()
x = self.avgpool(x)
clipwise_output = torch.flatten(x, 1)
#print(framewise_output.shape) #torch.Size([10, 100, 17])
framewise_output = interpolate(framewise_output, interpolate_ratio)
#framewise_output = framewise_output[:,:1000,:]
#framewise_output = pad_framewise_output(framewise_output, frames_num)
output_dict = {'framewise_output': framewise_output,
'clipwise_output': clipwise_output}
return output_dict
class PVT_lr(nn.Module):
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
fmax, classes_num):
super(PVT_lr, self).__init__()
window = 'hann'
center = True
pad_mode = 'reflect'
ref = 1.0
amin = 1e-10
top_db = None
# Spectrogram extractor
self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
win_length=window_size, window=window, center=center, pad_mode=pad_mode,
freeze_parameters=True)
# Logmel feature extractor
self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
freeze_parameters=True)
self.time_shift = TimeShift(0, 10)
# Spec augmenter
self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
freq_drop_width=8, freq_stripes_num=2)
self.bn0 = nn.BatchNorm2d(64)
self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
fdim=64,
patch_size=7,
stride=4,
in_chans=1,
num_classes=classes_num,
embed_dims=[64, 128, 320, 512],
depths=[3, 4, 6, 3],
num_heads=[1, 2, 5, 8],
mlp_ratios=[8, 8, 4, 4],
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
drop_path_rate=0.1,
sr_ratios=[8, 4, 2, 1],
norm_layer=partial(nn.LayerNorm, eps=1e-6),
num_stages=4,
pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
)
self.temp_pool = LinearSoftPool()
self.fc_audioset = nn.Linear(512, classes_num, bias=True)
self.init_weights()
def init_weights(self):
init_bn(self.bn0)
init_layer(self.fc_audioset)
def forward(self, input, mixup_lambda=None):
"""Input: (batch_size, times_steps, freq_bins)"""
interpolate_ratio = 32
x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
frames_num = x.shape[2]
x = x.transpose(1, 3)
x = self.bn0(x)
x = x.transpose(1, 3)
if self.training:
x = self.time_shift(x)
x = self.spec_augmenter(x)
# Mixup on spectrogram
if self.training and mixup_lambda is not None:
x = do_mixup(x, mixup_lambda)
#print(x.shape) #torch.Size([10, 1, 1001, 64])
x = self.pvt_transformer(x)
#print(x.shape) #torch.Size([10, 800, 128])
x = torch.mean(x, dim=3)
x = x.transpose(1, 2).contiguous()
framewise_output = torch.sigmoid(self.fc_audioset(x))
clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
#print(framewise_output.shape) #torch.Size([10, 100, 17])
framewise_output = interpolate(framewise_output, interpolate_ratio)
#framewise_output = framewise_output[:,:1000,:]
#framewise_output = pad_framewise_output(framewise_output, frames_num)
output_dict = {'framewise_output': framewise_output,
'clipwise_output': clipwise_output}
return output_dict
class PVT_nopretrain(nn.Module):
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
fmax, classes_num):
super(PVT_nopretrain, self).__init__()
window = 'hann'
center = True
pad_mode = 'reflect'
ref = 1.0
amin = 1e-10
top_db = None
# Spectrogram extractor
self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
win_length=window_size, window=window, center=center, pad_mode=pad_mode,
freeze_parameters=True)
# Logmel feature extractor
self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
freeze_parameters=True)
self.time_shift = TimeShift(0, 10)
# Spec augmenter
self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
freq_drop_width=8, freq_stripes_num=2)
self.bn0 = nn.BatchNorm2d(64)
self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001,
fdim=64,
patch_size=7,
stride=4,
in_chans=1,
num_classes=classes_num,
embed_dims=[64, 128, 320, 512],
depths=[3, 4, 6, 3],
num_heads=[1, 2, 5, 8],
mlp_ratios=[8, 8, 4, 4],
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
drop_path_rate=0.1,
sr_ratios=[8, 4, 2, 1],
norm_layer=partial(nn.LayerNorm, eps=1e-6),
num_stages=4,
#pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'
)
self.temp_pool = LinearSoftPool()
self.fc_audioset = nn.Linear(512, classes_num, bias=True)
self.init_weights()
def init_weights(self):
init_bn(self.bn0)
init_layer(self.fc_audioset)
def forward(self, input, mixup_lambda=None):
"""Input: (batch_size, times_steps, freq_bins)"""
interpolate_ratio = 32
x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins)
x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins)
frames_num = x.shape[2]
x = x.transpose(1, 3)
x = self.bn0(x)
x = x.transpose(1, 3)
if self.training:
x = self.time_shift(x)
x = self.spec_augmenter(x)
# Mixup on spectrogram
if self.training and mixup_lambda is not None:
x = do_mixup(x, mixup_lambda)
#print(x.shape) #torch.Size([10, 1, 1001, 64])
x = self.pvt_transformer(x)
#print(x.shape) #torch.Size([10, 800, 128])
x = torch.mean(x, dim=3)
x = x.transpose(1, 2).contiguous()
framewise_output = torch.sigmoid(self.fc_audioset(x))
clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1)
#print(framewise_output.shape) #torch.Size([10, 100, 17])
framewise_output = interpolate(framewise_output, interpolate_ratio)
framewise_output = framewise_output[:,:1000,:]
#framewise_output = pad_framewise_output(framewise_output, frames_num)
output_dict = {'framewise_output': framewise_output,
'clipwise_output': clipwise_output}
return output_dict
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.dwconv = DWConv(hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
self.linear = linear
if self.linear:
self.relu = nn.ReLU()
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
m.bias.data.zero_()
def forward(self, x, H, W):
x = self.fc1(x)
if self.linear:
x = self.relu(x)
x = self.dwconv(x, H, W)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False):
super().__init__()
assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
self.dim = dim
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim ** -0.5
self.q = nn.Linear(dim, dim, bias=qkv_bias)
self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.linear = linear
self.sr_ratio = sr_ratio
if not linear:
if sr_ratio > 1:
self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
self.norm = nn.LayerNorm(dim)
else:
self.pool = nn.AdaptiveAvgPool2d(7)
self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1)
self.norm = nn.LayerNorm(dim)
self.act = nn.GELU()
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
m.bias.data.zero_()
def forward(self, x, H, W):
B, N, C = x.shape
q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
if not self.linear:
if self.sr_ratio > 1:
x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
x_ = self.norm(x_)
kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
else:
kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
else:
x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1)
x_ = self.norm(x_)
x_ = self.act(x_)
kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
k, v = kv[0], kv[1]
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
class Pooling(nn.Module):
"""
Implementation of pooling for PoolFormer
--pool_size: pooling size
"""
def __init__(self, pool_size=3):
super().__init__()
self.pool = nn.AvgPool2d(
pool_size, stride=1, padding=pool_size//2, count_include_pad=False)
def forward(self, x):
return self.pool(x) - x
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim,
num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear)
#self.norm3 = norm_layer(dim)
#self.token_mixer = Pooling(pool_size=3)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
m.bias.data.zero_()
def forward(self, x, H, W):
x = x + self.drop_path(self.attn(self.norm1(x), H, W))
x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
return x
class OverlapPatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
def __init__(self, tdim, fdim, patch_size=7, stride=4, in_chans=3, embed_dim=768):
super().__init__()
img_size = (tdim, fdim)
patch_size = to_2tuple(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.H, self.W = img_size[0] // stride, img_size[1] // stride
self.num_patches = self.H * self.W
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
padding=(patch_size[0] // 3, patch_size[1] // 3))
self.norm = nn.LayerNorm(embed_dim)
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
m.bias.data.zero_()
def forward(self, x):
x = self.proj(x)
_, _, H, W = x.shape
x = x.flatten(2).transpose(1, 2)
x = self.norm(x)
return x, H, W
class PyramidVisionTransformerV2(nn.Module):
def __init__(self, tdim=1001, fdim=64, patch_size=16, stride=4, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
attn_drop_rate=0., drop_path_rate=0.1, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3],
sr_ratios=[8, 4, 2, 1], num_stages=2, linear=False, pretrained=None):
super().__init__()
# self.num_classes = num_classes
self.depths = depths
self.num_stages = num_stages
self.linear = linear
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
cur = 0
for i in range(num_stages):
patch_embed = OverlapPatchEmbed(tdim=tdim if i == 0 else tdim // (2 ** (i + 1)),
fdim=fdim if i == 0 else fdim // (2 ** (i + 1)),
patch_size=7 if i == 0 else 3,
stride=stride if i == 0 else 2,
in_chans=in_chans if i == 0 else embed_dims[i - 1],
embed_dim=embed_dims[i])
block = nn.ModuleList([Block(
dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer,
sr_ratio=sr_ratios[i], linear=linear)
for j in range(depths[i])])
norm = norm_layer(embed_dims[i])
cur += depths[i]
setattr(self, f"patch_embed{i + 1}", patch_embed)
setattr(self, f"block{i + 1}", block)
setattr(self, f"norm{i + 1}", norm)
#self.n = nn.Linear(125, 250, bias=True)
# classification head
# self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()
self.apply(self._init_weights)
self.init_weights(pretrained)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
fan_out //= m.groups
m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if m.bias is not None:
m.bias.data.zero_()
def init_weights(self, pretrained=None):
if isinstance(pretrained, str):
logger = get_root_logger()
load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
def freeze_patch_emb(self):
self.patch_embed1.requires_grad = False
@torch.jit.ignore
def no_weight_decay(self):
return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'}  # excluding pos_embed from weight decay may work better
def get_classifier(self):
return self.head
def reset_classifier(self, num_classes, global_pool=''):
self.num_classes = num_classes
self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
def forward_features(self, x):
B = x.shape[0]
for i in range(self.num_stages):
patch_embed = getattr(self, f"patch_embed{i + 1}")
block = getattr(self, f"block{i + 1}")
norm = getattr(self, f"norm{i + 1}")
x, H, W = patch_embed(x)
#print(x.shape)
for blk in block:
x = blk(x, H, W)
#print(x.shape)
x = norm(x)
#if i != self.num_stages - 1:
x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
#print(x.shape)
return x
def forward(self, x):
x = self.forward_features(x)
# x = self.head(x)
return x
class DWConv(nn.Module):
def __init__(self, dim=768):
super(DWConv, self).__init__()
self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
def forward(self, x, H, W):
B, N, C = x.shape
x = x.transpose(1, 2).view(B, C, H, W)
x = self.dwconv(x)
x = x.flatten(2).transpose(1, 2)
return x
def _conv_filter(state_dict, patch_size=16):
""" convert patch embedding weight from manual patchify + linear proj to conv"""
out_dict = {}
for k, v in state_dict.items():
if 'patch_embed.proj.weight' in k:
v = v.reshape((v.shape[0], 3, patch_size, patch_size))
out_dict[k] = v
return out_dict

View File

@@ -0,0 +1,251 @@
import numpy as np
import time
import torch
import torch.nn as nn
def move_data_to_device(x, device):
if 'float' in str(x.dtype):
x = torch.Tensor(x)
elif 'int' in str(x.dtype):
x = torch.LongTensor(x)
else:
return x
return x.to(device)
def do_mixup(x, mixup_lambda):
"""Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes
(1, 3, 5, ...).
Args:
x: (batch_size * 2, ...)
mixup_lambda: (batch_size * 2,)
Returns:
out: (batch_size, ...)
"""
out = (x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2] + \
x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]).transpose(0, -1)
return out
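# Worked sketch: with mixup_lambda = [l, 1 - l, m, 1 - m],
# out[0] = l * x[0] + (1 - l) * x[1] and out[1] = m * x[2] + (1 - m) * x[3].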
def append_to_dict(dict, key, value):
if key in dict.keys():
dict[key].append(value)
else:
dict[key] = [value]
def forward(model, generator, return_input=False,
return_target=False):
"""Forward data to a model.
Args:
model: object
generator: object
return_input: bool
return_target: bool
Returns:
audio_name: (audios_num,)
clipwise_output: (audios_num, classes_num)
(if present) segmentwise_output: (audios_num, segments_num, classes_num)
(if present) framewise_output: (audios_num, frames_num, classes_num)
(optional) return_input: (audios_num, segment_samples)
(optional) return_target: (audios_num, classes_num)
"""
output_dict = {}
device = next(model.parameters()).device
time1 = time.time()
# Forward data to a model in mini-batches
for n, batch_data_dict in enumerate(generator):
print(n)
batch_waveform = move_data_to_device(batch_data_dict['waveform'], device)
with torch.no_grad():
model.eval()
batch_output = model(batch_waveform)
append_to_dict(output_dict, 'audio_name', batch_data_dict['audio_name'])
append_to_dict(output_dict, 'clipwise_output',
batch_output['clipwise_output'].data.cpu().numpy())
if 'segmentwise_output' in batch_output.keys():
append_to_dict(output_dict, 'segmentwise_output',
batch_output['segmentwise_output'].data.cpu().numpy())
if 'framewise_output' in batch_output.keys():
append_to_dict(output_dict, 'framewise_output',
batch_output['framewise_output'].data.cpu().numpy())
if return_input:
append_to_dict(output_dict, 'waveform', batch_data_dict['waveform'])
if return_target:
if 'target' in batch_data_dict.keys():
append_to_dict(output_dict, 'target', batch_data_dict['target'])
if n % 10 == 0:
print(' --- Inference time: {:.3f} s / 10 iterations ---'.format(
time.time() - time1))
time1 = time.time()
for key in output_dict.keys():
output_dict[key] = np.concatenate(output_dict[key], axis=0)
return output_dict
def interpolate(x, ratio):
"""Interpolate data in time domain. This is used to compensate the
resolution reduction in downsampling of a CNN.
Args:
x: (batch_size, time_steps, classes_num)
ratio: int, ratio to interpolate
Returns:
upsampled: (batch_size, time_steps * ratio, classes_num)
"""
(batch_size, time_steps, classes_num) = x.shape
upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
return upsampled
def pad_framewise_output(framewise_output, frames_num):
"""Pad framewise_output to the same length as input frames. The pad value
is the same as the value of the last frame.
Args:
framewise_output: (batch_size, frames_num, classes_num)
frames_num: int, number of frames to pad
Outputs:
output: (batch_size, frames_num, classes_num)
"""
pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1)
"""tensor for padding"""
output = torch.cat((framewise_output, pad), dim=1)
"""(batch_size, frames_num, classes_num)"""
return output
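# A hedged sketch combining interpolate and pad_framewise_output to recover
# per-frame predictions at the input resolution; the 32x downsampling ratio
# and all shapes are illustrative.
coarse = torch.rand(8, 31, 527)                    # (batch, coarse_steps, classes)
upsampled = interpolate(coarse, ratio=32)          # (8, 992, 527)
framewise = pad_framewise_output(upsampled, frames_num=1001)   # (8, 1001, 527)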
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def count_flops(model, audio_length):
"""Count flops. Code modified from others' implementation.
"""
multiply_adds = True
list_conv2d=[]
def conv2d_hook(self, input, output):
batch_size, input_channels, input_height, input_width = input[0].size()
output_channels, output_height, output_width = output[0].size()
kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
bias_ops = 1 if self.bias is not None else 0
params = output_channels * (kernel_ops + bias_ops)
flops = batch_size * params * output_height * output_width
list_conv2d.append(flops)
list_conv1d=[]
def conv1d_hook(self, input, output):
batch_size, input_channels, input_length = input[0].size()
output_channels, output_length = output[0].size()
kernel_ops = self.kernel_size[0] * (self.in_channels / self.groups) * (2 if multiply_adds else 1)
bias_ops = 1 if self.bias is not None else 0
params = output_channels * (kernel_ops + bias_ops)
flops = batch_size * params * output_length
list_conv1d.append(flops)
list_linear=[]
def linear_hook(self, input, output):
batch_size = input[0].size(0) if input[0].dim() == 2 else 1
weight_ops = self.weight.nelement() * (2 if multiply_adds else 1)
        bias_ops = self.bias.nelement() if self.bias is not None else 0
flops = batch_size * (weight_ops + bias_ops)
list_linear.append(flops)
list_bn=[]
def bn_hook(self, input, output):
list_bn.append(input[0].nelement() * 2)
list_relu=[]
def relu_hook(self, input, output):
list_relu.append(input[0].nelement() * 2)
list_pooling2d=[]
def pooling2d_hook(self, input, output):
batch_size, input_channels, input_height, input_width = input[0].size()
output_channels, output_height, output_width = output[0].size()
kernel_ops = self.kernel_size * self.kernel_size
bias_ops = 0
params = output_channels * (kernel_ops + bias_ops)
flops = batch_size * params * output_height * output_width
list_pooling2d.append(flops)
list_pooling1d=[]
def pooling1d_hook(self, input, output):
batch_size, input_channels, input_length = input[0].size()
output_channels, output_length = output[0].size()
kernel_ops = self.kernel_size[0]
bias_ops = 0
params = output_channels * (kernel_ops + bias_ops)
flops = batch_size * params * output_length
        list_pooling1d.append(flops)
def foo(net):
childrens = list(net.children())
if not childrens:
if isinstance(net, nn.Conv2d):
net.register_forward_hook(conv2d_hook)
elif isinstance(net, nn.Conv1d):
net.register_forward_hook(conv1d_hook)
elif isinstance(net, nn.Linear):
net.register_forward_hook(linear_hook)
elif isinstance(net, nn.BatchNorm2d) or isinstance(net, nn.BatchNorm1d):
net.register_forward_hook(bn_hook)
elif isinstance(net, nn.ReLU):
net.register_forward_hook(relu_hook)
elif isinstance(net, nn.AvgPool2d) or isinstance(net, nn.MaxPool2d):
net.register_forward_hook(pooling2d_hook)
elif isinstance(net, nn.AvgPool1d) or isinstance(net, nn.MaxPool1d):
net.register_forward_hook(pooling1d_hook)
else:
print('Warning: flop of module {} is not counted!'.format(net))
return
for c in childrens:
foo(c)
# Register hook
foo(model)
    device = next(model.parameters()).device
input = torch.rand(1, audio_length).to(device)
out = model(input)
total_flops = sum(list_conv2d) + sum(list_conv1d) + sum(list_linear) + \
sum(list_bn) + sum(list_relu) + sum(list_pooling2d) + sum(list_pooling1d)
return total_flops
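# A hedged profiling sketch. TinyModel below is purely illustrative (not from
# this repo); it only demonstrates the (batch, samples) waveform interface
# that count_flops feeds the model. Note the registered hooks stay attached.
class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv1d(1, 8, kernel_size=9, padding=4)
        self.fc = nn.Linear(8, 527)
    def forward(self, waveform):
        x = self.conv(waveform[:, None, :])   # (1, samples) -> (1, 8, samples)
        x = torch.relu(x).mean(dim=2)         # global average pool -> (1, 8)
        return self.fc(x)
model = TinyModel()
print('Params: {}'.format(count_parameters(model)))
print('Multiply-adds: {:.1f} M'.format(count_flops(model, audio_length=32000 * 10) / 1e6))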

Binary file not shown (new image, 230 KiB).

@@ -0,0 +1,94 @@
import numpy as np
import csv
sample_rate = 32000
clip_samples = sample_rate * 10 # Audio clips are 10-second
# Load label
with open('./audio_detection/audio_infer/metadata/class_labels_indices.csv', 'r') as f:
reader = csv.reader(f, delimiter=',')
lines = list(reader)
labels = []
ids = [] # Each label has a unique id such as "/m/068hy"
for i1 in range(1, len(lines)):
id = lines[i1][1]
label = lines[i1][2]
ids.append(id)
labels.append(label)
classes_num = len(labels)
lb_to_ix = {label : i for i, label in enumerate(labels)}
ix_to_lb = {i : label for i, label in enumerate(labels)}
id_to_ix = {id : i for i, id in enumerate(ids)}
ix_to_id = {i : id for i, id in enumerate(ids)}
full_samples_per_class = np.array([
937432, 16344, 7822, 10271, 2043, 14420, 733, 1511,
1258, 424, 1751, 704, 369, 590, 1063, 1375,
5026, 743, 853, 1648, 714, 1497, 1251, 2139,
1093, 133, 224, 39469, 6423, 407, 1559, 4546,
6826, 7464, 2468, 549, 4063, 334, 587, 238,
1766, 691, 114, 2153, 236, 209, 421, 740,
269, 959, 137, 4192, 485, 1515, 655, 274,
69, 157, 1128, 807, 1022, 346, 98, 680,
890, 352, 4169, 2061, 1753, 9883, 1339, 708,
37857, 18504, 12864, 2475, 2182, 757, 3624, 677,
1683, 3583, 444, 1780, 2364, 409, 4060, 3097,
3143, 502, 723, 600, 230, 852, 1498, 1865,
1879, 2429, 5498, 5430, 2139, 1761, 1051, 831,
2401, 2258, 1672, 1711, 987, 646, 794, 25061,
5792, 4256, 96, 8126, 2740, 752, 513, 554,
106, 254, 1592, 556, 331, 615, 2841, 737,
265, 1349, 358, 1731, 1115, 295, 1070, 972,
174, 937780, 112337, 42509, 49200, 11415, 6092, 13851,
2665, 1678, 13344, 2329, 1415, 2244, 1099, 5024,
9872, 10948, 4409, 2732, 1211, 1289, 4807, 5136,
1867, 16134, 14519, 3086, 19261, 6499, 4273, 2790,
8820, 1228, 1575, 4420, 3685, 2019, 664, 324,
513, 411, 436, 2997, 5162, 3806, 1389, 899,
8088, 7004, 1105, 3633, 2621, 9753, 1082, 26854,
3415, 4991, 2129, 5546, 4489, 2850, 1977, 1908,
1719, 1106, 1049, 152, 136, 802, 488, 592,
2081, 2712, 1665, 1128, 250, 544, 789, 2715,
8063, 7056, 2267, 8034, 6092, 3815, 1833, 3277,
8813, 2111, 4662, 2678, 2954, 5227, 1472, 2591,
3714, 1974, 1795, 4680, 3751, 6585, 2109, 36617,
6083, 16264, 17351, 3449, 5034, 3931, 2599, 4134,
3892, 2334, 2211, 4516, 2766, 2862, 3422, 1788,
2544, 2403, 2892, 4042, 3460, 1516, 1972, 1563,
1579, 2776, 1647, 4535, 3921, 1261, 6074, 2922,
3068, 1948, 4407, 712, 1294, 1019, 1572, 3764,
5218, 975, 1539, 6376, 1606, 6091, 1138, 1169,
7925, 3136, 1108, 2677, 2680, 1383, 3144, 2653,
1986, 1800, 1308, 1344, 122231, 12977, 2552, 2678,
7824, 768, 8587, 39503, 3474, 661, 430, 193,
1405, 1442, 3588, 6280, 10515, 785, 710, 305,
206, 4990, 5329, 3398, 1771, 3022, 6907, 1523,
8588, 12203, 666, 2113, 7916, 434, 1636, 5185,
1062, 664, 952, 3490, 2811, 2749, 2848, 15555,
363, 117, 1494, 1647, 5886, 4021, 633, 1013,
5951, 11343, 2324, 243, 372, 943, 734, 242,
3161, 122, 127, 201, 1654, 768, 134, 1467,
642, 1148, 2156, 1368, 1176, 302, 1909, 61,
223, 1812, 287, 422, 311, 228, 748, 230,
1876, 539, 1814, 737, 689, 1140, 591, 943,
353, 289, 198, 490, 7938, 1841, 850, 457,
814, 146, 551, 728, 1627, 620, 648, 1621,
2731, 535, 88, 1736, 736, 328, 293, 3170,
344, 384, 7640, 433, 215, 715, 626, 128,
3059, 1833, 2069, 3732, 1640, 1508, 836, 567,
2837, 1151, 2068, 695, 1494, 3173, 364, 88,
188, 740, 677, 273, 1533, 821, 1091, 293,
647, 318, 1202, 328, 532, 2847, 526, 721,
370, 258, 956, 1269, 1641, 339, 1322, 4485,
286, 1874, 277, 757, 1393, 1330, 380, 146,
377, 394, 318, 339, 1477, 1886, 101, 1435,
284, 1425, 686, 621, 221, 117, 87, 1340,
201, 1243, 1222, 651, 1899, 421, 712, 1016,
1279, 124, 351, 258, 7043, 368, 666, 162,
7664, 137, 70159, 26179, 6321, 32236, 33320, 771,
1169, 269, 1103, 444, 364, 2710, 121, 751,
1609, 855, 1141, 2287, 1940, 3943, 289])
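# A hedged sketch of the lookup tables built above; 'Speech' is used only as
# an example label that appears in the AudioSet csv.
ix = lb_to_ix['Speech']                  # label string -> class index
print(ix_to_id[ix], ix_to_lb[ix])        # class index -> AudioSet id and label
print(int(full_samples_per_class[ix]))   # training clips available for the class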


@@ -0,0 +1,12 @@
import sys
class ExceptionHook:
instance = None
def __call__(self, *args, **kwargs):
if self.instance is None:
from IPython.core import ultratb
self.instance = ultratb.FormattedTB(mode='Plain',
color_scheme='Linux', call_pdb=1)
return self.instance(*args, **kwargs)
sys.excepthook = ExceptionHook()


@@ -0,0 +1,64 @@
import argparse
import csv
import os
from utilities import create_folder
def dcase2017task4(args):
"""Create black list. Black list is a list of audio ids that will be
skipped in training.
"""
# Augments & parameters
workspace = args.workspace
# Black list from DCASE 2017 Task 4
test_weak_csv = 'metadata/black_list/groundtruth_weak_label_testing_set.csv'
evaluation_weak_csv = 'metadata/black_list/groundtruth_weak_label_evaluation_set.csv'
black_list_csv = os.path.join(workspace, 'black_list', 'dcase2017task4.csv')
create_folder(os.path.dirname(black_list_csv))
def get_id_sets(csv_path):
with open(csv_path, 'r') as fr:
reader = csv.reader(fr, delimiter='\t')
lines = list(reader)
ids_set = []
for line in lines:
"""line: ['-5QrBL6MzLg_60.000_70.000.wav', '60.000', '70.000', 'Train horn']"""
ids_set.append(line[0][0 : 11])
ids_set = list(set(ids_set))
return ids_set
test_ids_set = get_id_sets(test_weak_csv)
evaluation_ids_set = get_id_sets(evaluation_weak_csv)
full_ids_set = test_ids_set + evaluation_ids_set
    # Write black list
    with open(black_list_csv, 'w') as fw:
        for audio_id in full_ids_set:
            fw.write('{}\n'.format(audio_id))
    print('Write black list to {}'.format(black_list_csv))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='')
subparsers = parser.add_subparsers(dest='mode')
parser_dcase2017task4 = subparsers.add_parser('dcase2017task4')
parser_dcase2017task4.add_argument('--workspace', type=str, required=True)
args = parser.parse_args()
if args.mode == 'dcase2017task4':
dcase2017task4(args)
else:
        raise Exception('Incorrect arguments!')
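# A hedged, programmatic equivalent of the command-line entry point; the
# workspace path is a placeholder.
# Same as: python <this_script> dcase2017task4 --workspace=workspaces/audioset
args = argparse.Namespace(mode='dcase2017task4', workspace='workspaces/audioset')
dcase2017task4(args)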


@@ -0,0 +1,126 @@
import numpy as np
import argparse
import csv
import os
import glob
import datetime
import time
import logging
import h5py
import librosa
from utilities import create_folder, get_sub_filepaths
import config
def create_indexes(args):
"""Create indexes a for dataloader to read for training. When users have
a new task and their own data, they need to create similar indexes. The
indexes contain meta information of "where to find the data for training".
"""
# Arguments & parameters
waveforms_hdf5_path = args.waveforms_hdf5_path
indexes_hdf5_path = args.indexes_hdf5_path
# Paths
create_folder(os.path.dirname(indexes_hdf5_path))
with h5py.File(waveforms_hdf5_path, 'r') as hr:
with h5py.File(indexes_hdf5_path, 'w') as hw:
audios_num = len(hr['audio_name'])
hw.create_dataset('audio_name', data=hr['audio_name'][:], dtype='S20')
            hw.create_dataset('target', data=hr['target'][:], dtype=bool)
hw.create_dataset('hdf5_path', data=[waveforms_hdf5_path.encode()] * audios_num, dtype='S200')
hw.create_dataset('index_in_hdf5', data=np.arange(audios_num), dtype=np.int32)
print('Write to {}'.format(indexes_hdf5_path))
def combine_full_indexes(args):
"""Combine all balanced and unbalanced indexes hdf5s to a single hdf5. This
combined indexes hdf5 is used for training with full data (~20k balanced
audio clips + ~1.9m unbalanced audio clips).
"""
# Arguments & parameters
indexes_hdf5s_dir = args.indexes_hdf5s_dir
full_indexes_hdf5_path = args.full_indexes_hdf5_path
classes_num = config.classes_num
# Paths
paths = get_sub_filepaths(indexes_hdf5s_dir)
paths = [path for path in paths if (
'train' in path and 'full_train' not in path and 'mini' not in path)]
print('Total {} hdf5 to combine.'.format(len(paths)))
with h5py.File(full_indexes_hdf5_path, 'w') as full_hf:
full_hf.create_dataset(
name='audio_name',
shape=(0,),
maxshape=(None,),
dtype='S20')
full_hf.create_dataset(
name='target',
shape=(0, classes_num),
maxshape=(None, classes_num),
            dtype=bool)
full_hf.create_dataset(
name='hdf5_path',
shape=(0,),
maxshape=(None,),
dtype='S200')
full_hf.create_dataset(
name='index_in_hdf5',
shape=(0,),
maxshape=(None,),
dtype=np.int32)
for path in paths:
with h5py.File(path, 'r') as part_hf:
print(path)
n = len(full_hf['audio_name'][:])
new_n = n + len(part_hf['audio_name'][:])
full_hf['audio_name'].resize((new_n,))
full_hf['audio_name'][n : new_n] = part_hf['audio_name'][:]
full_hf['target'].resize((new_n, classes_num))
full_hf['target'][n : new_n] = part_hf['target'][:]
full_hf['hdf5_path'].resize((new_n,))
full_hf['hdf5_path'][n : new_n] = part_hf['hdf5_path'][:]
full_hf['index_in_hdf5'].resize((new_n,))
full_hf['index_in_hdf5'][n : new_n] = part_hf['index_in_hdf5'][:]
print('Write combined full hdf5 to {}'.format(full_indexes_hdf5_path))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest='mode')
parser_create_indexes = subparsers.add_parser('create_indexes')
parser_create_indexes.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path of packed waveforms hdf5.')
parser_create_indexes.add_argument('--indexes_hdf5_path', type=str, required=True, help='Path to write out indexes hdf5.')
parser_combine_full_indexes = subparsers.add_parser('combine_full_indexes')
parser_combine_full_indexes.add_argument('--indexes_hdf5s_dir', type=str, required=True, help='Directory containing indexes hdf5s to be combined.')
parser_combine_full_indexes.add_argument('--full_indexes_hdf5_path', type=str, required=True, help='Path to write out full indexes hdf5 file.')
args = parser.parse_args()
if args.mode == 'create_indexes':
create_indexes(args)
elif args.mode == 'combine_full_indexes':
combine_full_indexes(args)
else:
raise Exception('Incorrect arguments!')
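# A hedged, programmatic equivalent of the create_indexes entry point; both
# hdf5 paths are placeholders.
args = argparse.Namespace(
    mode='create_indexes',
    waveforms_hdf5_path='workspaces/audioset/hdf5s/waveforms/balanced_train.h5',
    indexes_hdf5_path='workspaces/audioset/hdf5s/indexes/balanced_train.h5')
create_indexes(args)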


@@ -0,0 +1,421 @@
import numpy as np
import h5py
import csv
import time
import logging
from utilities import int16_to_float32
def read_black_list(black_list_csv):
"""Read audio names from black list.
"""
with open(black_list_csv, 'r') as fr:
reader = csv.reader(fr)
lines = list(reader)
black_list_names = ['Y{}.wav'.format(line[0]) for line in lines]
return black_list_names
class AudioSetDataset(object):
def __init__(self, sample_rate=32000):
"""This class takes the meta of an audio clip as input, and return
the waveform and target of the audio clip. This class is used by DataLoader.
"""
self.sample_rate = sample_rate
def __getitem__(self, meta):
"""Load waveform and target of an audio clip.
Args:
meta: {
'hdf5_path': str,
'index_in_hdf5': int}
Returns:
data_dict: {
'audio_name': str,
'waveform': (clip_samples,),
'target': (classes_num,)}
"""
hdf5_path = meta['hdf5_path']
index_in_hdf5 = meta['index_in_hdf5']
with h5py.File(hdf5_path, 'r') as hf:
audio_name = hf['audio_name'][index_in_hdf5].decode()
waveform = int16_to_float32(hf['waveform'][index_in_hdf5])
waveform = self.resample(waveform)
target = hf['target'][index_in_hdf5].astype(np.float32)
data_dict = {
'audio_name': audio_name, 'waveform': waveform, 'target': target}
return data_dict
def resample(self, waveform):
"""Resample.
Args:
waveform: (clip_samples,)
Returns:
(resampled_clip_samples,)
"""
if self.sample_rate == 32000:
return waveform
elif self.sample_rate == 16000:
return waveform[0 :: 2]
elif self.sample_rate == 8000:
return waveform[0 :: 4]
else:
raise Exception('Incorrect sample rate!')
class Base(object):
def __init__(self, indexes_hdf5_path, batch_size, black_list_csv, random_seed):
"""Base class of train sampler.
Args:
indexes_hdf5_path: string
batch_size: int
black_list_csv: string
random_seed: int
"""
self.batch_size = batch_size
self.random_state = np.random.RandomState(random_seed)
# Black list
if black_list_csv:
self.black_list_names = read_black_list(black_list_csv)
else:
self.black_list_names = []
logging.info('Black list samples: {}'.format(len(self.black_list_names)))
# Load target
load_time = time.time()
with h5py.File(indexes_hdf5_path, 'r') as hf:
self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]]
self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]]
self.indexes_in_hdf5 = hf['index_in_hdf5'][:]
self.targets = hf['target'][:].astype(np.float32)
(self.audios_num, self.classes_num) = self.targets.shape
logging.info('Training number: {}'.format(self.audios_num))
logging.info('Load target time: {:.3f} s'.format(time.time() - load_time))
class TrainSampler(Base):
def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
random_seed=1234):
"""Balanced sampler. Generate batch meta for training.
Args:
indexes_hdf5_path: string
batch_size: int
black_list_csv: string
random_seed: int
"""
super(TrainSampler, self).__init__(indexes_hdf5_path, batch_size,
black_list_csv, random_seed)
self.indexes = np.arange(self.audios_num)
# Shuffle indexes
self.random_state.shuffle(self.indexes)
self.pointer = 0
def __iter__(self):
"""Generate batch meta for training.
Returns:
batch_meta: e.g.: [
{'hdf5_path': string, 'index_in_hdf5': int},
...]
"""
batch_size = self.batch_size
while True:
batch_meta = []
i = 0
while i < batch_size:
index = self.indexes[self.pointer]
self.pointer += 1
# Shuffle indexes and reset pointer
if self.pointer >= self.audios_num:
self.pointer = 0
self.random_state.shuffle(self.indexes)
# If audio in black list then continue
if self.audio_names[index] in self.black_list_names:
continue
else:
batch_meta.append({
'hdf5_path': self.hdf5_paths[index],
'index_in_hdf5': self.indexes_in_hdf5[index]})
i += 1
yield batch_meta
def state_dict(self):
state = {
'indexes': self.indexes,
'pointer': self.pointer}
return state
def load_state_dict(self, state):
self.indexes = state['indexes']
self.pointer = state['pointer']
class BalancedTrainSampler(Base):
def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
random_seed=1234):
"""Balanced sampler. Generate batch meta for training. Data are equally
sampled from different sound classes.
Args:
indexes_hdf5_path: string
batch_size: int
black_list_csv: string
random_seed: int
"""
super(BalancedTrainSampler, self).__init__(indexes_hdf5_path,
batch_size, black_list_csv, random_seed)
self.samples_num_per_class = np.sum(self.targets, axis=0)
logging.info('samples_num_per_class: {}'.format(
self.samples_num_per_class.astype(np.int32)))
# Training indexes of all sound classes. E.g.:
# [[0, 11, 12, ...], [3, 4, 15, 16, ...], [7, 8, ...], ...]
self.indexes_per_class = []
for k in range(self.classes_num):
self.indexes_per_class.append(
np.where(self.targets[:, k] == 1)[0])
# Shuffle indexes
for k in range(self.classes_num):
self.random_state.shuffle(self.indexes_per_class[k])
self.queue = []
self.pointers_of_classes = [0] * self.classes_num
def expand_queue(self, queue):
classes_set = np.arange(self.classes_num).tolist()
self.random_state.shuffle(classes_set)
queue += classes_set
return queue
def __iter__(self):
"""Generate batch meta for training.
Returns:
batch_meta: e.g.: [
{'hdf5_path': string, 'index_in_hdf5': int},
...]
"""
batch_size = self.batch_size
while True:
batch_meta = []
i = 0
while i < batch_size:
if len(self.queue) == 0:
self.queue = self.expand_queue(self.queue)
class_id = self.queue.pop(0)
pointer = self.pointers_of_classes[class_id]
self.pointers_of_classes[class_id] += 1
index = self.indexes_per_class[class_id][pointer]
# When finish one epoch of a sound class, then shuffle its indexes and reset pointer
if self.pointers_of_classes[class_id] >= self.samples_num_per_class[class_id]:
self.pointers_of_classes[class_id] = 0
self.random_state.shuffle(self.indexes_per_class[class_id])
# If audio in black list then continue
if self.audio_names[index] in self.black_list_names:
continue
else:
batch_meta.append({
'hdf5_path': self.hdf5_paths[index],
'index_in_hdf5': self.indexes_in_hdf5[index]})
i += 1
yield batch_meta
def state_dict(self):
state = {
'indexes_per_class': self.indexes_per_class,
'queue': self.queue,
'pointers_of_classes': self.pointers_of_classes}
return state
def load_state_dict(self, state):
self.indexes_per_class = state['indexes_per_class']
self.queue = state['queue']
self.pointers_of_classes = state['pointers_of_classes']
class AlternateTrainSampler(Base):
def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None,
random_seed=1234):
"""AlternateSampler is a combination of Sampler and Balanced Sampler.
AlternateSampler alternately sample data from Sampler and Blanced Sampler.
Args:
indexes_hdf5_path: string
batch_size: int
black_list_csv: string
random_seed: int
"""
self.sampler1 = TrainSampler(indexes_hdf5_path, batch_size,
black_list_csv, random_seed)
self.sampler2 = BalancedTrainSampler(indexes_hdf5_path, batch_size,
black_list_csv, random_seed)
self.batch_size = batch_size
self.count = 0
def __iter__(self):
"""Generate batch meta for training.
Returns:
batch_meta: e.g.: [
{'hdf5_path': string, 'index_in_hdf5': int},
...]
"""
batch_size = self.batch_size
while True:
self.count += 1
if self.count % 2 == 0:
batch_meta = []
i = 0
while i < batch_size:
index = self.sampler1.indexes[self.sampler1.pointer]
self.sampler1.pointer += 1
# Shuffle indexes and reset pointer
if self.sampler1.pointer >= self.sampler1.audios_num:
self.sampler1.pointer = 0
self.sampler1.random_state.shuffle(self.sampler1.indexes)
# If audio in black list then continue
if self.sampler1.audio_names[index] in self.sampler1.black_list_names:
continue
else:
batch_meta.append({
'hdf5_path': self.sampler1.hdf5_paths[index],
'index_in_hdf5': self.sampler1.indexes_in_hdf5[index]})
i += 1
elif self.count % 2 == 1:
batch_meta = []
i = 0
while i < batch_size:
if len(self.sampler2.queue) == 0:
self.sampler2.queue = self.sampler2.expand_queue(self.sampler2.queue)
class_id = self.sampler2.queue.pop(0)
pointer = self.sampler2.pointers_of_classes[class_id]
self.sampler2.pointers_of_classes[class_id] += 1
index = self.sampler2.indexes_per_class[class_id][pointer]
# When finish one epoch of a sound class, then shuffle its indexes and reset pointer
if self.sampler2.pointers_of_classes[class_id] >= self.sampler2.samples_num_per_class[class_id]:
self.sampler2.pointers_of_classes[class_id] = 0
self.sampler2.random_state.shuffle(self.sampler2.indexes_per_class[class_id])
# If audio in black list then continue
if self.sampler2.audio_names[index] in self.sampler2.black_list_names:
continue
else:
batch_meta.append({
'hdf5_path': self.sampler2.hdf5_paths[index],
'index_in_hdf5': self.sampler2.indexes_in_hdf5[index]})
i += 1
yield batch_meta
def state_dict(self):
state = {
'sampler1': self.sampler1.state_dict(),
'sampler2': self.sampler2.state_dict()}
return state
def load_state_dict(self, state):
self.sampler1.load_state_dict(state['sampler1'])
self.sampler2.load_state_dict(state['sampler2'])
class EvaluateSampler(object):
def __init__(self, indexes_hdf5_path, batch_size):
"""Evaluate sampler. Generate batch meta for evaluation.
Args:
indexes_hdf5_path: string
batch_size: int
"""
self.batch_size = batch_size
with h5py.File(indexes_hdf5_path, 'r') as hf:
self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]]
self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]]
self.indexes_in_hdf5 = hf['index_in_hdf5'][:]
self.targets = hf['target'][:].astype(np.float32)
self.audios_num = len(self.audio_names)
def __iter__(self):
"""Generate batch meta for training.
Returns:
batch_meta: e.g.: [
{'hdf5_path': string,
'index_in_hdf5': int}
...]
"""
batch_size = self.batch_size
pointer = 0
while pointer < self.audios_num:
batch_indexes = np.arange(pointer,
min(pointer + batch_size, self.audios_num))
batch_meta = []
for index in batch_indexes:
batch_meta.append({
'audio_name': self.audio_names[index],
'hdf5_path': self.hdf5_paths[index],
'index_in_hdf5': self.indexes_in_hdf5[index],
'target': self.targets[index]})
pointer += batch_size
yield batch_meta
def collate_fn(list_data_dict):
"""Collate data.
Args:
list_data_dict, e.g., [{'audio_name': str, 'waveform': (clip_samples,), ...},
{'audio_name': str, 'waveform': (clip_samples,), ...},
...]
Returns:
np_data_dict, dict, e.g.,
{'audio_name': (batch_size,), 'waveform': (batch_size, clip_samples), ...}
"""
np_data_dict = {}
for key in list_data_dict[0].keys():
np_data_dict[key] = np.array([data_dict[key] for data_dict in list_data_dict])
return np_data_dict
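# A hedged sketch of wiring these pieces into a PyTorch DataLoader; the index
# path and batch size are placeholders. The samplers yield whole lists of meta
# dicts, so they go in as batch_sampler and __getitem__ receives one meta each.
import torch.utils.data
dataset = AudioSetDataset(sample_rate=32000)
train_sampler = BalancedTrainSampler(
    indexes_hdf5_path='workspaces/audioset/hdf5s/indexes/full_train.h5',
    batch_size=32, black_list_csv=None)
train_loader = torch.utils.data.DataLoader(dataset=dataset,
    batch_sampler=train_sampler, collate_fn=collate_fn,
    num_workers=8, pin_memory=True)
batch_data_dict = next(iter(train_loader))   # {'waveform': (32, clip_samples), ...}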


@@ -0,0 +1,224 @@
import numpy as np
import argparse
import csv
import os
import glob
import datetime
import time
import logging
import h5py
import librosa
from utilities import (create_folder, get_filename, create_logging,
float32_to_int16, pad_or_truncate, read_metadata)
import config
def split_unbalanced_csv_to_partial_csvs(args):
"""Split unbalanced csv to part csvs. Each part csv contains up to 50000 ids.
"""
unbalanced_csv_path = args.unbalanced_csv
unbalanced_partial_csvs_dir = args.unbalanced_partial_csvs_dir
create_folder(unbalanced_partial_csvs_dir)
with open(unbalanced_csv_path, 'r') as f:
lines = f.readlines()
lines = lines[3:] # Remove head info
audios_num_per_file = 50000
files_num = int(np.ceil(len(lines) / float(audios_num_per_file)))
for r in range(files_num):
lines_per_file = lines[r * audios_num_per_file :
(r + 1) * audios_num_per_file]
out_csv_path = os.path.join(unbalanced_partial_csvs_dir,
'unbalanced_train_segments_part{:02d}.csv'.format(r))
with open(out_csv_path, 'w') as f:
f.write('empty\n')
f.write('empty\n')
f.write('empty\n')
for line in lines_per_file:
f.write(line)
print('Write out csv to {}'.format(out_csv_path))
def download_wavs(args):
"""Download videos and extract audio in wav format.
"""
# Paths
csv_path = args.csv_path
audios_dir = args.audios_dir
mini_data = args.mini_data
    if mini_data:
        logs_dir = '_logs/download_dataset_minidata/{}'.format(get_filename(csv_path))
    else:
        logs_dir = '_logs/download_dataset/{}'.format(get_filename(csv_path))
create_folder(audios_dir)
create_folder(logs_dir)
create_logging(logs_dir, filemode='w')
logging.info('Download log is saved to {}'.format(logs_dir))
# Read csv
with open(csv_path, 'r') as f:
lines = f.readlines()
lines = lines[3:] # Remove csv head info
if mini_data:
lines = lines[0 : 10] # Download partial data for debug
download_time = time.time()
# Download
for (n, line) in enumerate(lines):
items = line.split(', ')
audio_id = items[0]
start_time = float(items[1])
end_time = float(items[2])
duration = end_time - start_time
logging.info('{} {} start_time: {:.1f}, end_time: {:.1f}'.format(
n, audio_id, start_time, end_time))
# Download full video of whatever format
video_name = os.path.join(audios_dir, '_Y{}.%(ext)s'.format(audio_id))
os.system("youtube-dl --quiet -o '{}' -x https://www.youtube.com/watch?v={}"\
.format(video_name, audio_id))
video_paths = glob.glob(os.path.join(audios_dir, '_Y' + audio_id + '.*'))
# If download successful
if len(video_paths) > 0:
video_path = video_paths[0] # Choose one video
            # Prepend 'Y' because some video ids start with '-',
            # which would otherwise be parsed as a command-line flag
audio_path = os.path.join(audios_dir, 'Y' + audio_id + '.wav')
# Extract audio in wav format
os.system("ffmpeg -loglevel panic -i {} -ac 1 -ar 32000 -ss {} -t 00:00:{} {} "\
.format(video_path,
str(datetime.timedelta(seconds=start_time)), duration,
audio_path))
# Remove downloaded video
os.system("rm {}".format(video_path))
logging.info("Download and convert to {}".format(audio_path))
logging.info('Download finished! Time spent: {:.3f} s'.format(
time.time() - download_time))
logging.info('Logs can be viewed in {}'.format(logs_dir))
def pack_waveforms_to_hdf5(args):
"""Pack waveform and target of several audio clips to a single hdf5 file.
This can speed up loading and training.
"""
# Arguments & parameters
audios_dir = args.audios_dir
csv_path = args.csv_path
waveforms_hdf5_path = args.waveforms_hdf5_path
mini_data = args.mini_data
clip_samples = config.clip_samples
classes_num = config.classes_num
sample_rate = config.sample_rate
id_to_ix = config.id_to_ix
# Paths
if mini_data:
prefix = 'mini_'
waveforms_hdf5_path += '.mini'
else:
prefix = ''
create_folder(os.path.dirname(waveforms_hdf5_path))
logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(prefix, get_filename(csv_path))
create_folder(logs_dir)
create_logging(logs_dir, filemode='w')
logging.info('Write logs to {}'.format(logs_dir))
# Read csv file
meta_dict = read_metadata(csv_path, classes_num, id_to_ix)
if mini_data:
mini_num = 10
for key in meta_dict.keys():
meta_dict[key] = meta_dict[key][0 : mini_num]
audios_num = len(meta_dict['audio_name'])
# Pack waveform to hdf5
total_time = time.time()
with h5py.File(waveforms_hdf5_path, 'w') as hf:
        hf.create_dataset('audio_name', shape=(audios_num,), dtype='S20')
        hf.create_dataset('waveform', shape=(audios_num, clip_samples), dtype=np.int16)
        hf.create_dataset('target', shape=(audios_num, classes_num), dtype=bool)
hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32)
# Pack waveform & target of several audio clips to a single hdf5 file
for n in range(audios_num):
audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n])
if os.path.isfile(audio_path):
logging.info('{} {}'.format(n, audio_path))
(audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
audio = pad_or_truncate(audio, clip_samples)
hf['audio_name'][n] = meta_dict['audio_name'][n].encode()
hf['waveform'][n] = float32_to_int16(audio)
hf['target'][n] = meta_dict['target'][n]
else:
logging.info('{} File does not exist! {}'.format(n, audio_path))
logging.info('Write to {}'.format(waveforms_hdf5_path))
logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(dest='mode')
parser_split = subparsers.add_parser('split_unbalanced_csv_to_partial_csvs')
parser_split.add_argument('--unbalanced_csv', type=str, required=True, help='Path of unbalanced_csv file to read.')
parser_split.add_argument('--unbalanced_partial_csvs_dir', type=str, required=True, help='Directory to save out split unbalanced partial csv.')
parser_download_wavs = subparsers.add_parser('download_wavs')
parser_download_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
parser_download_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
    parser_download_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')
parser_pack_wavs = subparsers.add_parser('pack_waveforms_to_hdf5')
parser_pack_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
parser_pack_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
parser_pack_wavs.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path to save out packed hdf5.')
parser_pack_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')
args = parser.parse_args()
if args.mode == 'split_unbalanced_csv_to_partial_csvs':
split_unbalanced_csv_to_partial_csvs(args)
elif args.mode == 'download_wavs':
download_wavs(args)
elif args.mode == 'pack_waveforms_to_hdf5':
pack_waveforms_to_hdf5(args)
else:
raise Exception('Incorrect arguments!')


@@ -0,0 +1,565 @@
import os
import sys
import numpy as np
import argparse
import h5py
import time
import pickle
import matplotlib.pyplot as plt
import csv
from sklearn import metrics
from utilities import (create_folder, get_filename, d_prime)
import config
def load_statistics(statistics_path):
statistics_dict = pickle.load(open(statistics_path, 'rb'))
bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num)
bal_map = np.mean(bal_map, axis=-1)
test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num)
test_map = np.mean(test_map, axis=-1)
return bal_map, test_map
def crop_label(label):
max_len = 16
if len(label) <= max_len:
return label
else:
words = label.split(' ')
cropped_label = ''
for w in words:
if len(cropped_label + ' ' + w) > max_len:
break
else:
cropped_label += ' {}'.format(w)
        return cropped_label.lstrip()
def add_comma(integer):
    """E.g., 1234567 -> 1,234,567
    """
    # '{:,}' groups thousands correctly for any magnitude and zero-pads groups
    return '{:,}'.format(int(integer))
def plot_classwise_iteration_map(args):
# Paths
save_out_path = 'results/classwise_iteration_map.pdf'
create_folder(os.path.dirname(save_out_path))
# Load statistics
statistics_dict = pickle.load(open('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl', 'rb'))
mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']])
mAP_mat = mAP_mat[0 : 300, :] # 300 * 2000 = 600k iterations
sorted_indexes = np.argsort(config.full_samples_per_class)[::-1]
fig, axs = plt.subplots(1, 3, figsize=(20, 5))
ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)]
axs[0].set_ylabel('AP')
for col in range(0, 3):
axs[col].set_ylim(0, 1.)
axs[col].set_xlim(0, 301)
axs[col].set_xlabel('Iterations')
axs[col].set_ylabel('AP')
axs[col].xaxis.set_ticks(np.arange(0, 301, 100))
axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k'])
lines = []
for _ix in ranges[col]:
_label = crop_label(config.labels[sorted_indexes[_ix]]) + \
' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]]))
line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label)
lines.append(line)
box = axs[col].get_position()
axs[col].set_position([box.x0, box.y0, box.width * 1., box.height])
axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.))
axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
plt.tight_layout(pad=4, w_pad=1, h_pad=1)
plt.savefig(save_out_path)
print(save_out_path)
def plot_six_figures(args):
# Arguments & parameters
classes_num = config.classes_num
labels = config.labels
max_plot_iteration = 540000
iterations = np.arange(0, max_plot_iteration, 2000)
# Paths
class_labels_indices_path = os.path.join('metadata', 'class_labels_indices.csv')
save_out_path = 'results/six_figures.pdf'
create_folder(os.path.dirname(save_out_path))
# Plot
fig, ax = plt.subplots(2, 3, figsize=(14, 7))
bal_alpha = 0.3
test_alpha = 1.0
linewidth = 1.
# (a) Comparison of architectures
if True:
lines = []
# Wavegram-Logmel-CNN
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl')
line, = ax[0, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 0].plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Cnn14
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[0, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 0].plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# MobileNetV1
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_MobileNetV1_balanced_mixup_bs32.pkl')
line, = ax[0, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 0].plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
ax[0, 0].legend(handles=lines, loc=2)
ax[0, 0].set_title('(a) Comparison of architectures')
# (b) Comparison of training data and augmentation'
if True:
lines = []
# Full data + balanced sampler + mixup
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[0, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Full data + balanced sampler + mixup in time domain
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_timedomain_bs32.pkl')
line, = ax[0, 1].plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Full data + balanced sampler + no mixup
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_nomixup_bs32.pkl')
line, = ax[0, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Full data + uniform sampler + no mixup
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_nobalanced_nomixup_bs32.pkl')
line, = ax[0, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 1].plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Balanced data + balanced sampler + mixup
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[0, 1].plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Balanced data + balanced sampler + no mixup
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_nomixup_bs32.pkl')
line, = ax[0, 1].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
ax[0, 1].legend(handles=lines, loc=2, fontsize=8)
ax[0, 1].set_title('(b) Comparison of training data and augmentation')
# (c) Comparison of embedding size
if True:
lines = []
# Embedding size 2048
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[0, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 2].plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Embedding size 128
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb128_balanced_mixup_bs32.pkl')
line, = ax[0, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 2].plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Embedding size 32
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb32_balanced_mixup_bs32.pkl')
line, = ax[0, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
line, = ax[0, 2].plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
ax[0, 2].legend(handles=lines, loc=2)
ax[0, 2].set_title('(c) Comparison of embedding size')
# (d) Comparison of amount of training data
if True:
lines = []
# 100% of full training data
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[1, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
line, = ax[1, 0].plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# 80% of full training data
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.8full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[1, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
line, = ax[1, 0].plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# 50% of full training data
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.5full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[1, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
        line, = ax[1, 0].plot(test_map, label='CNN14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
ax[1, 0].legend(handles=lines, loc=2)
ax[1, 0].set_title('(d) Comparison of amount of training data')
# (e) Comparison of sampling rate
if True:
lines = []
# Cnn14 + 32 kHz
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[1, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
line, = ax[1, 1].plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Cnn14 + 16 kHz
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_16k_balanced_mixup_bs32.pkl')
line, = ax[1, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth)
line, = ax[1, 1].plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Cnn14 + 8 kHz
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_8k_balanced_mixup_bs32.pkl')
line, = ax[1, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth)
line, = ax[1, 1].plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
ax[1, 1].legend(handles=lines, loc=2)
ax[1, 1].set_title('(e) Comparison of sampling rate')
# (f) Comparison of mel bins number
if True:
lines = []
# Cnn14 + 128 mel bins
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel128_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[1, 2].plot(bal_map, color='g', alpha=bal_alpha)
line, = ax[1, 2].plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Cnn14 + 64 mel bins
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[1, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth)
line, = ax[1, 2].plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
# Cnn14 + 32 mel bins
(bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel32_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl')
line, = ax[1, 2].plot(bal_map, color='b', alpha=bal_alpha)
line, = ax[1, 2].plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth)
lines.append(line)
ax[1, 2].legend(handles=lines, loc=2)
ax[1, 2].set_title('(f) Comparison of mel bins number')
for i in range(2):
for j in range(3):
ax[i, j].set_ylim(0, 0.8)
ax[i, j].set_xlim(0, len(iterations))
ax[i, j].set_xlabel('Iterations')
ax[i, j].set_ylabel('mAP')
ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50))
ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k'])
ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05))
ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3',
'', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8'])
ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3)
    plt.tight_layout(pad=0, w_pad=1, h_pad=0)
plt.savefig(save_out_path)
print('Save figure to {}'.format(save_out_path))
def plot_complexity_map(args):
# Paths
save_out_path = 'results/complexity_mAP.pdf'
create_folder(os.path.dirname(save_out_path))
plt.figure(figsize=(5, 5))
fig, ax = plt.subplots(1, 1)
model_types = np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54',
'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18',
'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 'Wavegram-\nLogmel-CNN'])
flops = np.array([21.986, 28.166, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810,
30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510])
mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295,
0.266, 0.336, 0.365, 0.355, 0.389, 0.439])
ax.scatter(flops, mAPs)
shift = [[-5.5, -0.004], [1, -0.004], [-1, -0.014], [-2, 0.006], [-7, 0.006],
[1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008],
[1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]]
for i, model_type in enumerate(model_types):
ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1]))
ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]])
ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]])
ax.plot(flops[[6, 7]], mAPs[[6, 7]])
ax.plot(flops[[9, 10]], mAPs[[9, 10]])
ax.plot(flops[[11, 12]], mAPs[[11, 12]])
ax.plot(flops[[13, 14]], mAPs[[13, 14]])
ax.set_xlim(0, 70)
ax.set_ylim(0.2, 0.5)
    ax.set_xlabel('Multiply-adds (million)', fontsize=15)
ax.set_ylabel('mAP', fontsize=15)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)
    plt.tight_layout(pad=0, w_pad=0, h_pad=0)
plt.savefig(save_out_path)
print('Write out figure to {}'.format(save_out_path))
def plot_long_fig(args):
# Paths
stats = pickle.load(open('paper_statistics/stats_for_long_fig.pkl', 'rb'))
save_out_path = 'results/long_fig.pdf'
create_folder(os.path.dirname(save_out_path))
# Load meta
N = len(config.labels)
sorted_indexes = stats['sorted_indexes_for_plot']
sorted_labels = np.array(config.labels)[sorted_indexes]
audio_clips_per_class = stats['official_balanced_training_samples'] + stats['official_unbalanced_training_samples']
audio_clips_per_class = audio_clips_per_class[sorted_indexes]
# Prepare axes for plot
(ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels)
# plot the number of training samples
ax1a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
ax2a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
ax3a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
ax4a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
# Load mAP of different systems
"""Average instance system of [1] with an mAP of 0.317.
[1] Kong, Qiuqiang, Changsong Yu, Yong Xu, Turab Iqbal, Wenwu Wang, and
Mark D. Plumbley. "Weakly labelled audioset tagging with attention neural
networks." IEEE/ACM Transactions on Audio, Speech, and Language Processing
27, no. 11 (2019): 1791-1802."""
maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision']
maps_avg_instances = maps_avg_instances[sorted_indexes]
# PANNs Cnn14
maps_panns_cnn14 = stats['panns_cnn14']['eval']['average_precision']
maps_panns_cnn14 = maps_panns_cnn14[sorted_indexes]
# PANNs MobileNetV1
maps_panns_mobilenetv1 = stats['panns_mobilenetv1']['eval']['average_precision']
maps_panns_mobilenetv1 = maps_panns_mobilenetv1[sorted_indexes]
# PANNs Wavegram-Logmel-Cnn14
maps_panns_wavegram_logmel_cnn14 = stats['panns_wavegram_logmel_cnn14']['eval']['average_precision']
maps_panns_wavegram_logmel_cnn14 = maps_panns_wavegram_logmel_cnn14[sorted_indexes]
# Plot mAPs
_scatter_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='g')
_scatter_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='r')
_scatter_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b')
_scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k')
linewidth = 0.7
line0te = _plot_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b,
c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN')
line1te = _plot_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, c='r',
linewidth=linewidth, label='AP with CNN14')
line2te = _plot_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b',
linewidth=linewidth, label='AP with MobileNetV1')
line3te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k',
linewidth=linewidth, label='AP with averaging instances (baseline)')
# Plot label quality
label_quality = stats['label_quality']
sorted_label_quality = np.array(label_quality)[sorted_indexes]
for k in range(len(sorted_label_quality)):
if sorted_label_quality[k] and sorted_label_quality[k] == 1:
sorted_label_quality[k] = 0.99
ax1b.scatter(np.arange(N)[sorted_label_quality != None],
sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+')
ax2b.scatter(np.arange(N)[sorted_label_quality != None],
sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+')
ax3b.scatter(np.arange(N)[sorted_label_quality != None],
sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+')
line_label_quality = ax4b.scatter(np.arange(N)[sorted_label_quality != None],
sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+', label='Label quality')
ax1b.scatter(np.arange(N)[sorted_label_quality == None],
0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
ax2b.scatter(np.arange(N)[sorted_label_quality == None],
0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
ax3b.scatter(np.arange(N)[sorted_label_quality == None],
0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
ax4b.scatter(np.arange(N)[sorted_label_quality == None],
0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_')
plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1)
    plt.tight_layout(pad=0, w_pad=0, h_pad=0)
plt.savefig(save_out_path)
print('Save fig to {}'.format(save_out_path))
def prepare_plot_long_4_rows(sorted_lbs):
N = len(sorted_lbs)
f,(ax1a, ax2a, ax3a, ax4a) = plt.subplots(4, 1, sharey=False, facecolor='w', figsize=(10, 10.5))
fontsize = 5
K = 132
ax1a.set_xlim(0, K)
ax2a.set_xlim(K, 2 * K)
ax3a.set_xlim(2 * K, 3 * K)
ax4a.set_xlim(3 * K, N)
truncated_sorted_lbs = []
for lb in sorted_lbs:
lb = lb[0 : 25]
words = lb.split(' ')
if len(words[-1]) < 3:
lb = ' '.join(words[0:-1])
truncated_sorted_lbs.append(lb)
ax1a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
ax2a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
ax3a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
ax4a.grid(which='major', axis='x', linestyle='-', alpha=0.3)
ax1a.set_yscale('log')
ax2a.set_yscale('log')
ax3a.set_yscale('log')
ax4a.set_yscale('log')
ax1b = ax1a.twinx()
ax2b = ax2a.twinx()
ax3b = ax3a.twinx()
ax4b = ax4a.twinx()
ax1b.set_ylim(0., 1.)
ax2b.set_ylim(0., 1.)
ax3b.set_ylim(0., 1.)
ax4b.set_ylim(0., 1.)
ax1b.set_ylabel('Average precision')
ax2b.set_ylabel('Average precision')
ax3b.set_ylabel('Average precision')
ax4b.set_ylabel('Average precision')
ax1b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
ax2b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
ax3b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
ax4b.yaxis.grid(color='grey', linestyle='--', alpha=0.5)
ax1a.xaxis.set_ticks(np.arange(K))
ax1a.xaxis.set_ticklabels(truncated_sorted_lbs[0:K], rotation=90, fontsize=fontsize)
ax1a.xaxis.tick_bottom()
ax1a.set_ylabel("Number of audio clips")
ax2a.xaxis.set_ticks(np.arange(K, 2*K))
ax2a.xaxis.set_ticklabels(truncated_sorted_lbs[K:2*K], rotation=90, fontsize=fontsize)
ax2a.xaxis.tick_bottom()
ax2a.set_ylabel("Number of audio clips")
ax3a.xaxis.set_ticks(np.arange(2*K, 3*K))
ax3a.xaxis.set_ticklabels(truncated_sorted_lbs[2*K:3*K], rotation=90, fontsize=fontsize)
ax3a.xaxis.tick_bottom()
ax3a.set_ylabel("Number of audio clips")
ax4a.xaxis.set_ticks(np.arange(3*K, N))
ax4a.xaxis.set_ticklabels(truncated_sorted_lbs[3*K:], rotation=90, fontsize=fontsize)
ax4a.xaxis.tick_bottom()
ax4a.set_ylabel("Number of audio clips")
ax1a.spines['right'].set_visible(False)
ax1b.spines['right'].set_visible(False)
ax2a.spines['left'].set_visible(False)
ax2b.spines['left'].set_visible(False)
ax2a.spines['right'].set_visible(False)
ax2b.spines['right'].set_visible(False)
ax3a.spines['left'].set_visible(False)
ax3b.spines['left'].set_visible(False)
ax3a.spines['right'].set_visible(False)
ax3b.spines['right'].set_visible(False)
ax4a.spines['left'].set_visible(False)
ax4b.spines['left'].set_visible(False)
plt.subplots_adjust(hspace = 0.8)
return ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b
def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.):
N = len(x)
ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha)
def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""):
N = len(x)
ax.plot(x, c=c, linewidth=linewidth, alpha=alpha)
ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha)
ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha)
line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label)
return line
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='')
subparsers = parser.add_subparsers(dest='mode')
parser_classwise_iteration_map = subparsers.add_parser('plot_classwise_iteration_map')
parser_six_figures = subparsers.add_parser('plot_six_figures')
parser_complexity_map = subparsers.add_parser('plot_complexity_map')
parser_long_fig = subparsers.add_parser('plot_long_fig')
args = parser.parse_args()
if args.mode == 'plot_classwise_iteration_map':
plot_classwise_iteration_map(args)
elif args.mode == 'plot_six_figures':
plot_six_figures(args)
elif args.mode == 'plot_complexity_map':
plot_complexity_map(args)
elif args.mode == 'plot_long_fig':
plot_long_fig(args)
else:
raise Exception('Incorrect argument!')

File diff suppressed because it is too large.


@@ -0,0 +1,172 @@
import os
import logging
import h5py
import soundfile
import librosa
import numpy as np
import pandas as pd
from scipy import stats
import datetime
import pickle
def create_folder(fd):
if not os.path.exists(fd):
os.makedirs(fd)
def get_filename(path):
path = os.path.realpath(path)
na_ext = path.split('/')[-1]
na = os.path.splitext(na_ext)[0]
return na
def get_sub_filepaths(folder):
paths = []
for root, dirs, files in os.walk(folder):
for name in files:
path = os.path.join(root, name)
paths.append(path)
return paths
def create_logging(log_dir, filemode):
create_folder(log_dir)
i1 = 0
while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))):
i1 += 1
log_path = os.path.join(log_dir, '{:04d}.log'.format(i1))
logging.basicConfig(
level=logging.DEBUG,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S',
filename=log_path,
filemode=filemode)
# Print to console
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
return logging
def read_metadata(csv_path, classes_num, id_to_ix):
"""Read metadata of AudioSet from a csv file.
Args:
csv_path: str
Returns:
meta_dict: {'audio_name': (audios_num,), 'target': (audios_num, classes_num)}
"""
with open(csv_path, 'r') as fr:
lines = fr.readlines()
lines = lines[3:] # Remove heads
audios_num = len(lines)
    targets = np.zeros((audios_num, classes_num), dtype=bool)
audio_names = []
for n, line in enumerate(lines):
items = line.split(', ')
"""items: ['--4gqARaEJE', '0.000', '10.000', '"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"\n']"""
        audio_name = 'Y{}.wav'.format(items[0])  # downloaded audio filenames are prefixed with an extra 'Y'
label_ids = items[3].split('"')[1].split(',')
audio_names.append(audio_name)
# Target
for id in label_ids:
ix = id_to_ix[id]
targets[n, ix] = 1
meta_dict = {'audio_name': np.array(audio_names), 'target': targets}
return meta_dict
def float32_to_int16(x):
assert np.max(np.abs(x)) <= 1.2
x = np.clip(x, -1, 1)
return (x * 32767.).astype(np.int16)
def int16_to_float32(x):
return (x / 32767.).astype(np.float32)
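# Illustrative round-trip sketch (assumes the waveform is already scaled to [-1, 1]):
# x = np.array([0.0, 0.5, -1.0], dtype=np.float32)
# y = float32_to_int16(x) # -> array([0, 16383, -32767], dtype=int16)
# z = int16_to_float32(y) # -> approximately the original x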
def pad_or_truncate(x, audio_length):
"""Pad all audio to specific length."""
if len(x) <= audio_length:
return np.concatenate((x, np.zeros(audio_length - len(x))), axis=0)
else:
return x[0 : audio_length]
def d_prime(auc):
d_prime = stats.norm().ppf(auc) * np.sqrt(2.0)
return d_prime
class Mixup(object):
def __init__(self, mixup_alpha, random_seed=1234):
"""Mixup coefficient generator.
"""
self.mixup_alpha = mixup_alpha
self.random_state = np.random.RandomState(random_seed)
def get_lambda(self, batch_size):
"""Get mixup random coefficients.
Args:
batch_size: int
Returns:
mixup_lambdas: (batch_size,)
"""
mixup_lambdas = []
for n in range(0, batch_size, 2):
lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0]
mixup_lambdas.append(lam)
mixup_lambdas.append(1. - lam)
return np.array(mixup_lambdas)
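# Illustrative sketch: the lambdas come in complementary pairs (lam, 1 - lam), so sample n
# of a batch can be mixed with sample n + 1 while their weights still sum to 1.
# mixup = Mixup(mixup_alpha=1.0)
# lams = mixup.get_lambda(batch_size=4) # e.g. array([0.42, 0.58, 0.77, 0.23])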
class StatisticsContainer(object):
def __init__(self, statistics_path):
"""Contain statistics of different training iterations.
"""
self.statistics_path = statistics_path
self.backup_statistics_path = '{}_{}.pkl'.format(
os.path.splitext(self.statistics_path)[0],
datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
self.statistics_dict = {'bal': [], 'test': []}
def append(self, iteration, statistics, data_type):
statistics['iteration'] = iteration
self.statistics_dict[data_type].append(statistics)
def dump(self):
pickle.dump(self.statistics_dict, open(self.statistics_path, 'wb'))
pickle.dump(self.statistics_dict, open(self.backup_statistics_path, 'wb'))
logging.info(' Dump statistics to {}'.format(self.statistics_path))
logging.info(' Dump statistics to {}'.format(self.backup_statistics_path))
def load_state_dict(self, resume_iteration):
self.statistics_dict = pickle.load(open(self.statistics_path, 'rb'))
resume_statistics_dict = {'bal': [], 'test': []}
for key in self.statistics_dict.keys():
for statistics in self.statistics_dict[key]:
if statistics['iteration'] <= resume_iteration:
resume_statistics_dict[key].append(statistics)
self.statistics_dict = resume_statistics_dict
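# Illustrative usage sketch (the path below is a hypothetical example):
# container = StatisticsContainer('workspaces/statistics/stats.pkl')
# container.append(iteration=1000, statistics={'average_precision': 0.31}, data_type='test')
# container.dump() # writes stats.pkl plus a timestamped backup copy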

File diff suppressed because it is too large


@@ -0,0 +1,353 @@
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/3/9 16:33
# @Author : dongchao yang
# @File : train.py
import collections.abc
import sys
from loguru import logger
from pprint import pformat
import numpy as np
import pandas as pd
import scipy.ndimage
import six
import sklearn.preprocessing as pre
import torch
import tqdm
import yaml
from scipy.interpolate import interp1d
def parse_config_or_kwargs(config_file, **kwargs):
"""parse_config_or_kwargs
:param config_file: Config file that has parameters, yaml format
:param **kwargs: Other alternative parameters or overwrites for config
"""
with open(config_file) as con_read:
yaml_config = yaml.load(con_read, Loader=yaml.FullLoader)
arguments = dict(yaml_config, **kwargs)
return arguments
def find_contiguous_regions(activity_array): # a simple O(n) scan also works if the vectorized version is hard to follow
"""Find contiguous regions from bool valued numpy.array.
Copy of https://dcase-repo.github.io/dcase_util/_modules/dcase_util/data/decisions.html#DecisionEncoder
Reason is:
1. This does not belong to a class necessarily
2. Importing DecisionEncoder pulls in sndfile through other imports, which causes problems on some clusters
"""
change_indices = np.logical_xor(activity_array[1:], activity_array[:-1]).nonzero()[0]
change_indices += 1
if activity_array[0]:
# If the first element of activity_array is True add 0 at the beginning
change_indices = np.r_[0, change_indices]
if activity_array[-1]:
# If the last element of activity_array is True, add the length of the array
change_indices = np.r_[change_indices, activity_array.size]
# Reshape the result into two columns
return change_indices.reshape((-1, 2))
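# Worked example: each row of the result is an [onset, offset) index pair.
# activity = np.array([False, True, True, False, True])
# find_contiguous_regions(activity) # -> array([[1, 3], [4, 5]])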
def split_train_cv(
data_frame: pd.DataFrame,
frac: float = 0.9,
y=None, # Only for stratified, computes necessary split
**kwargs):
"""split_train_cv
:param data_frame:
:type data_frame: pd.DataFrame
:param frac:
:type frac: float
"""
if kwargs.get('mode',
None) == 'urbansed': # Filenames are DATA_-1 DATA_-2 etc
data_frame.loc[:, 'id'] = data_frame.groupby(
data_frame['filename'].str.split('_').apply(
lambda x: '_'.join(x[:-1]))).ngroup()
sampler = np.random.permutation(data_frame['id'].nunique())
num_train = int(frac * len(sampler))
train_indexes = sampler[:num_train]
cv_indexes = sampler[num_train:]
train_data = data_frame[data_frame['id'].isin(train_indexes)]
cv_data = data_frame[data_frame['id'].isin(cv_indexes)]
del train_data['id']
del cv_data['id']
elif kwargs.get('mode', None) == 'stratified':
# Use stratified sampling
from skmultilearn.model_selection import iterative_train_test_split
index_train, _, index_cv, _ = iterative_train_test_split(
data_frame.index.values.reshape(-1, 1), y, test_size=1. - frac)
train_data = data_frame[data_frame.index.isin(index_train.squeeze())]
cv_data = data_frame[data_frame.index.isin(index_cv.squeeze())] # cv --> cross validation
else:
# Simply split train_test
train_data = data_frame.sample(frac=frac, random_state=10)
cv_data = data_frame[~data_frame.index.isin(train_data.index)]
return train_data, cv_data
def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'): # print yaml file
"""pprint_dict
:param outputfun: output function to use, defaults to sys.stdout.write
:param in_dict: dict to print
"""
if formatter == 'yaml':
format_fun = yaml.dump
elif formatter == 'pretty':
format_fun = pformat
for line in format_fun(in_dict).split('\n'):
outputfun(line)
def getfile_outlogger(outputfile):
log_format = "[<green>{time:YYYY-MM-DD HH:mm:ss}</green>] {message}"
logger.configure(handlers=[{"sink": sys.stderr, "format": log_format}])
if outputfile:
logger.add(outputfile, enqueue=True, format=log_format)
return logger
# according label, get encoder
def train_labelencoder(labels: pd.Series, sparse=True):
"""encode_labels
Encodes labels
:param labels: pd.Series representing the raw labels e.g., Speech, Water
:param encoder (optional): Encoder already fitted
returns encoded labels (many hot) and the encoder
"""
assert isinstance(labels, pd.Series), "Labels need to be series"
if isinstance(labels[0], six.string_types):
# In case of using non-processed strings, e.g., Vacuum, Speech
label_array = labels.str.split(',').values.tolist() # split label according to ','
elif isinstance(labels[0], np.ndarray):
# Encoder does not like to see numpy array
label_array = [lab.tolist() for lab in labels]
elif isinstance(labels[0], collections.abc.Iterable):
label_array = labels
encoder = pre.MultiLabelBinarizer(sparse_output=sparse)
encoder.fit(label_array)
return encoder
def encode_labels(labels: pd.Series, encoder=None, sparse=True):
"""encode_labels
Encodes labels
:param labels: pd.Series representing the raw labels e.g., Speech, Water
:param encoder (optional): Encoder already fitted
returns encoded labels (many hot) and the encoder
"""
assert isinstance(labels, pd.Series), "Labels need to be series"
instance = labels.iloc[0]
if isinstance(instance, six.string_types):
# In case of using non-processed strings, e.g., Vacuum, Speech
label_array = labels.str.split(',').values.tolist()
elif isinstance(instance, np.ndarray):
# Encoder does not like to see numpy array
label_array = [lab.tolist() for lab in labels]
elif isinstance(instance, collections.abc.Iterable):
label_array = labels
# label_array is now a list of label lists, where each label is a string
if not encoder:
encoder = pre.MultiLabelBinarizer(sparse_output=sparse) # lazily create an encoder if none was given
encoder.fit(label_array)
labels_encoded = encoder.transform(label_array) # map label strings to a multi-hot matrix
return labels_encoded, encoder
# return pd.arrays.SparseArray(
# [row.toarray().ravel() for row in labels_encoded]), encoder
def decode_with_timestamps(events, labels: np.ndarray):
"""decode_with_timestamps
Decodes the predicted label array (2d) into a list of
[(Labelname, onset, offset), ...]
:param events: event name(s) corresponding to the predictions
:param labels: n-dim binary prediction array
:type labels: np.ndarray
"""
if labels.ndim == 2:
return [_decode_with_timestamps(events[i], labels[i]) for i in range(labels.shape[0])]
else:
return _decode_with_timestamps(events, labels)
def median_filter(x, window_size, threshold=0.5):
"""median_filter
:param x: input prediction array of shape (B, T, C) or (B, T).
Input is a sequence of probabilities 0 <= x <= 1
:param window_size: An integer to use
:param threshold: Binary thresholding threshold
"""
x = binarize(x, threshold=threshold) # binarize to 0/1
if x.ndim == 3:
size = (1, window_size, 1)
elif x.ndim == 2 and x.shape[0] == 1:
# Assume input is class-specific median filtering
# E.g, Batch x Time [1, 501]
size = (1, window_size)
elif x.ndim == 2 and x.shape[0] > 1:
# Assume input is standard median pooling, class-independent
# E.g., Time x Class [501, 10]
size = (window_size, 1)
return scipy.ndimage.median_filter(x, size=size)
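# Illustrative sketch: smooth framewise probabilities of shape (time, classes) before decoding.
# probs = np.random.rand(501, 10) # time x class probabilities in [0, 1]
# smoothed = median_filter(probs, window_size=7, threshold=0.5) # 0/1 array, same shape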
def _decode_with_timestamps(events, labels):
result_labels = []
change_indices = find_contiguous_regions(labels)
for row in change_indices:
result_labels.append((events, row[0], row[1]))
return result_labels
def inverse_transform_labels(encoder, pred):
if pred.ndim == 3:
return [encoder.inverse_transform(x) for x in pred]
else:
return encoder.inverse_transform(pred)
def binarize(pred, threshold=0.5):
# Batch_wise
if pred.ndim == 3:
return np.array(
[pre.binarize(sub, threshold=threshold) for sub in pred])
else:
return pre.binarize(pred, threshold=threshold)
def double_threshold(x, high_thres, low_thres, n_connect=1):
"""double_threshold
Helper function to calculate double threshold for n-dim arrays
:param x: input array
:param high_thres: high threshold value
:param low_thres: Low threshold value
:param n_connect: Distance of <= n clusters will be merged
"""
assert x.ndim <= 3, "Unexpected input shape ({}); check that it has <= 3 dims".format(
x.shape)
if x.ndim == 3:
apply_dim = 1
elif x.ndim < 3:
apply_dim = 0
# x is assumed to be 3d: (batch, time, dim)
# Assumed to be 2d : (time, dim)
# Assumed to be 1d : (time)
# the time axis is therefore 1 for 3d input and 0 for 1d/2d input
return np.apply_along_axis(lambda x: _double_threshold(
x, high_thres, low_thres, n_connect=n_connect),
axis=apply_dim,
arr=x)
def _double_threshold(x, high_thres, low_thres, n_connect=1, return_arr=True): # double thresholding handles the boundary between strong and weak activations
"""_double_threshold
Computes a double threshold over the input array
:param x: input array, needs to be 1d
:param high_thres: High threshold over the array
:param low_thres: Low threshold over the array
:param n_connect: Postprocessing, maximal distance between clusters to connect
:param return_arr: If True (default), return an array of the same size as x filled with ones and zeros; otherwise return the filtered index pairs.
"""
assert x.ndim == 1, "Input needs to be 1d"
high_locations = np.where(x > high_thres)[0] # indices where the value exceeds high_thres
locations = x > low_thres # boolean mask of values above low_thres
encoded_pairs = find_contiguous_regions(locations)
filtered_list = list(
filter(
lambda pair:
((pair[0] <= high_locations) & (high_locations <= pair[1])).any(),
encoded_pairs)) # keep pairs that contain at least one high location
filtered_list = connect_(filtered_list, n_connect) # merge pairs whose gap is at most n_connect
if return_arr:
zero_one_arr = np.zeros_like(x, dtype=int)
for sl in filtered_list:
zero_one_arr[sl[0]:sl[1]] = 1
return zero_one_arr
return filtered_list
def connect_clusters(x, n=1):
if x.ndim == 1:
return connect_clusters_(x, n)
if x.ndim >= 2:
return np.apply_along_axis(lambda a: connect_clusters_(a, n=n), -2, x)
def connect_clusters_(x, n=1):
"""connect_clusters_
Connects clustered predictions (0,1) in x with range n
:param x: Input array. zero-one format
:param n: Number of frames to skip until connection can be made
"""
assert x.ndim == 1, "input needs to be 1d"
reg = find_contiguous_regions(x)
start_end = connect_(reg, n=n)
zero_one_arr = np.zeros_like(x, dtype=int)
for sl in start_end:
zero_one_arr[sl[0]:sl[1]] = 1
return zero_one_arr
def connect_(pairs, n=1):
"""connect_
Connects two adjacent clusters if their distance is <= n
:param pairs: list of cluster index pairs, e.g., [(1,5),(7,10)]
:param n: maximal gap between two clusters to merge
"""
if len(pairs) == 0:
return []
start_, end_ = pairs[0]
new_pairs = []
for i, (next_item, cur_item) in enumerate(zip(pairs[1:], pairs[0:])):
end_ = next_item[1]
if next_item[0] - cur_item[1] <= n:
pass
else:
new_pairs.append((start_, cur_item[1]))
start_ = next_item[0]
new_pairs.append((start_, end_))
return new_pairs
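# Worked example: (1, 5) and (7, 10) merge when their gap (7 - 5 = 2) is <= n.
# connect_([(1, 5), (7, 10)], n=2) # -> [(1, 10)]
# connect_([(1, 5), (7, 10)], n=1) # -> [(1, 5), (7, 10)]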
def predictions_to_time(df, ratio):
df.onset = df.onset * ratio
df.offset = df.offset * ratio
return df
def upgrade_resolution(arr, scale):
x = np.arange(0, arr.shape[0])
f = interp1d(x, arr, kind='linear', axis=0, fill_value='extrapolate')
scale_x = np.arange(0, arr.shape[0], 1 / scale)
up_scale = f(scale_x)
return up_scale
# Example:
# a = np.array([0.1, 0.2, 0.3, 0.8, 0.4, 0.1, 0.3, 0.9, 0.4])
# _double_threshold(a, 0.7, 0.2) -> ones over indices 2..8: both peak regions pass the
# high threshold and are merged because their gap is <= n_connect


@@ -31,4 +31,17 @@ wget -P text_to_speech/checkpoints/ljspeech/ps_adv_baseline -i https://huggingfa
# Audio to text
wget -P audio_to_text/audiocaps_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth
wget -P audio_to_text/clotho_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth
wget -P audio_to_text/pretrained_feature_extractors https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
# audio detection
cd audio_detection/audio_infer/useful_ckpts
wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/audio_detection.pth
cd ../../..
cd mono2binaural/useful_ckpts
wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/m2b.tar.gz
tar -zxvf m2b.tar.gz
rm m2b.tar.gz
cd ../..
cd audio_detection/target_sound_detection/useful_ckpts
wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/tsd.tar.gz
tar -zxvf tsd.tar.gz
rm tsd.tar.gz
cd ../../..
cd sound_extraction/useful_ckpts
wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/LASSNet.pt

mono2binaural/src/models.py

@@ -0,0 +1,110 @@
import numpy as np
import scipy.linalg
from scipy.spatial.transform import Rotation as R
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from src.warping import GeometricTimeWarper, MonotoneTimeWarper
from src.utils import Net
class GeometricWarper(nn.Module):
def __init__(self, sampling_rate=48000):
super().__init__()
self.warper = GeometricTimeWarper(sampling_rate=sampling_rate)
def _transmitter_mouth(self, view):
# offset between tracking markers and real mouth position in the dataset
mouth_offset = np.array([0.09, 0, -0.20])
quat = view[:, 3:, :].transpose(2, 1).contiguous().detach().cpu().view(-1, 4).numpy()
# make sure zero-padded values are set to non-zero values (else scipy raises an exception)
norms = scipy.linalg.norm(quat, axis=1)
eps_val = (norms == 0).astype(np.float32)
quat = quat + eps_val[:, None]
transmitter_rot_mat = R.from_quat(quat)
transmitter_mouth = transmitter_rot_mat.apply(mouth_offset, inverse=True)
transmitter_mouth = th.Tensor(transmitter_mouth).view(view.shape[0], -1, 3).transpose(2, 1).contiguous()
if view.is_cuda:
transmitter_mouth = transmitter_mouth.cuda()
return transmitter_mouth
def _3d_displacements(self, view):
transmitter_mouth = self._transmitter_mouth(view)
# offset between tracking markers and ears in the dataset
left_ear_offset = th.Tensor([0, -0.08, -0.22]).cuda() if view.is_cuda else th.Tensor([0, -0.08, -0.22])
right_ear_offset = th.Tensor([0, 0.08, -0.22]).cuda() if view.is_cuda else th.Tensor([0, 0.08, -0.22])
# compute displacements between transmitter mouth and receiver left/right ear
displacement_left = view[:, 0:3, :] + transmitter_mouth - left_ear_offset[None, :, None]
displacement_right = view[:, 0:3, :] + transmitter_mouth - right_ear_offset[None, :, None]
displacement = th.stack([displacement_left, displacement_right], dim=1)
return displacement
def _warpfield(self, view, seq_length):
return self.warper.displacements2warpfield(self._3d_displacements(view), seq_length)
def forward(self, mono, view):
'''
:param mono: input signal as tensor of shape B x 1 x T
:param view: rx/tx position/orientation as tensor of shape B x 7 x K (K = T / 400)
:return: warped: warped left/right ear signal as tensor of shape B x 2 x T
'''
return self.warper(th.cat([mono, mono], dim=1), self._3d_displacements(view))
class Warpnet(nn.Module):
def __init__(self, layers=4, channels=64, view_dim=7):
super().__init__()
self.layers = [nn.Conv1d(view_dim if l == 0 else channels, channels, kernel_size=2) for l in range(layers)]
self.layers = nn.ModuleList(self.layers)
self.linear = nn.Conv1d(channels, 2, kernel_size=1)
self.neural_warper = MonotoneTimeWarper()
self.geometric_warper = GeometricWarper()
def neural_warpfield(self, view, seq_length):
warpfield = view
for layer in self.layers:
warpfield = F.pad(warpfield, pad=[1, 0])
warpfield = F.relu(layer(warpfield))
warpfield = self.linear(warpfield)
warpfield = F.interpolate(warpfield, size=seq_length)
return warpfield
def forward(self, mono, view):
'''
:param mono: input signal as tensor of shape B x 1 x T
:param view: rx/tx position/orientation as tensor of shape B x 7 x K (K = T / 400)
:return: warped: warped left/right ear signal as tensor of shape B x 2 x T
'''
geometric_warpfield = self.geometric_warper._warpfield(view, mono.shape[-1])
neural_warpfield = self.neural_warpfield(view, mono.shape[-1])
warpfield = geometric_warpfield + neural_warpfield
# ensure causality
warpfield = -F.relu(-warpfield) # the predicted warp
warped = self.neural_warper(th.cat([mono, mono], dim=1), warpfield)
return warped
class BinauralNetwork(Net):
def __init__(self,
view_dim=7,
warpnet_layers=4,
warpnet_channels=64,
model_name='binaural_network',
use_cuda=True):
super().__init__(model_name, use_cuda)
self.warper = Warpnet(warpnet_layers, warpnet_channels)
if self.use_cuda:
self.cuda()
def forward(self, mono, view):
'''
:param mono: the input signal as a B x 1 x T tensor
:param view: the receiver/transmitter position/orientation as a B x 7 x K tensor (K = T / 400)
:return: the warped binaural output as a B x 2 x T tensor
'''
warped = self.warper(mono, view)
return warped
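# Illustrative usage sketch (shapes only; the weights here are untrained):
# net = BinauralNetwork(use_cuda=False)
# mono = th.randn(1, 1, 4800) # B x 1 x T
# view = th.randn(1, 7, 12) # B x 7 x K with K = T / 400
# binaural = net(mono, view) # B x 2 x T warped left/right signal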

mono2binaural/src/utils.py

@@ -0,0 +1,251 @@
"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.
This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import numpy as np
import torch as th
#import torchaudio as ta
class Net(th.nn.Module):
def __init__(self, model_name="network", use_cuda=True):
super().__init__()
self.use_cuda = use_cuda
self.model_name = model_name
def save(self, model_dir, suffix=''):
'''
save the network to model_dir/model_name.suffix.net
:param model_dir: directory to save the model to
:param suffix: suffix to append after model name
'''
if self.use_cuda:
self.cpu()
if suffix == "":
fname = f"{model_dir}/{self.model_name}.net"
else:
fname = f"{model_dir}/{self.model_name}.{suffix}.net"
th.save(self.state_dict(), fname)
if self.use_cuda:
self.cuda()
def load_from_file(self, model_file):
'''
load network parameters from model_file
:param model_file: file containing the model parameters
'''
if self.use_cuda:
self.cpu()
states = th.load(model_file)
self.load_state_dict(states)
if self.use_cuda:
self.cuda()
print(f"Loaded: {model_file}")
def load(self, model_dir, suffix=''):
'''
load network parameters from model_dir/model_name.suffix.net
:param model_dir: directory to load the model from
:param suffix: suffix to append after model name
'''
if suffix == "":
fname = f"{model_dir}/{self.model_name}.net"
else:
fname = f"{model_dir}/{self.model_name}.{suffix}.net"
self.load_from_file(fname)
def num_trainable_parameters(self):
'''
:return: the number of trainable parameters in the model
'''
return sum(p.numel() for p in self.parameters() if p.requires_grad)
# class NewbobAdam(th.optim.Adam):
# def __init__(self,
# weights,
# net,
# artifacts_dir,
# initial_learning_rate=0.001,
# decay=0.5,
# max_decay=0.01
# ):
# '''
# Newbob learning rate scheduler
# :param weights: weights to optimize
# :param net: the network, must be an instance of type src.utils.Net
# :param artifacts_dir: (str) directory to save/restore models to/from
# :param initial_learning_rate: (float) initial learning rate
# :param decay: (float) value to decrease learning rate by when loss doesn't improve further
# :param max_decay: (float) maximum decay of learning rate
# '''
# super().__init__(weights, lr=initial_learning_rate)
# self.last_epoch_loss = np.inf
# self.total_decay = 1
# self.net = net
# self.decay = decay
# self.max_decay = max_decay
# self.artifacts_dir = artifacts_dir
# # store initial state as backup
# if decay < 1.0:
# net.save(artifacts_dir, suffix="newbob")
# def update_lr(self, loss):
# '''
# update the learning rate based on the current loss value and historic loss values
# :param loss: the loss after the current iteration
# '''
# if loss > self.last_epoch_loss and self.decay < 1.0 and self.total_decay > self.max_decay:
# self.total_decay = self.total_decay * self.decay
# print(f"NewbobAdam: Decay learning rate (loss degraded from {self.last_epoch_loss} to {loss})."
# f"Total decay: {self.total_decay}")
# # restore previous network state
# self.net.load(self.artifacts_dir, suffix="newbob")
# # decrease learning rate
# for param_group in self.param_groups:
# param_group['lr'] = param_group['lr'] * self.decay
# else:
# self.last_epoch_loss = loss
# # save last snapshot to restore it in case of lr decrease
# if self.decay < 1.0 and self.total_decay > self.max_decay:
# self.net.save(self.artifacts_dir, suffix="newbob")
# class FourierTransform:
# def __init__(self,
# fft_bins=2048,
# win_length_ms=40,
# frame_rate_hz=100,
# causal=False,
# preemphasis=0.0,
# sample_rate=48000,
# normalized=False):
# self.sample_rate = sample_rate
# self.frame_rate_hz = frame_rate_hz
# self.preemphasis = preemphasis
# self.fft_bins = fft_bins
# self.win_length = int(sample_rate * win_length_ms / 1000)
# self.hop_length = int(sample_rate / frame_rate_hz)
# self.causal = causal
# self.normalized = normalized
# if self.win_length > self.fft_bins:
# print('FourierTransform Warning: fft_bins should be larger than win_length')
# def _convert_format(self, data, expected_dims):
# if not type(data) == th.Tensor:
# data = th.Tensor(data)
# if len(data.shape) < expected_dims:
# data = data.unsqueeze(0)
# if not len(data.shape) == expected_dims:
# raise Exception(f"FourierTransform: data needs to be a Tensor with {expected_dims} dimensions but got shape {data.shape}")
# return data
# def _preemphasis(self, audio):
# if self.preemphasis > 0:
# return th.cat((audio[:, 0:1], audio[:, 1:] - self.preemphasis * audio[:, :-1]), dim=1)
# return audio
# def _revert_preemphasis(self, audio):
# if self.preemphasis > 0:
# for i in range(1, audio.shape[1]):
# audio[:, i] = audio[:, i] + self.preemphasis * audio[:, i-1]
# return audio
# def _magphase(self, complex_stft):
# mag, phase = ta.functional.magphase(complex_stft, 1.0)
# return mag, phase
# def stft(self, audio):
# '''
# wrapper around th.stft
# audio: wave signal as th.Tensor
# '''
# hann = th.hann_window(self.win_length)
# hann = hann.cuda() if audio.is_cuda else hann
# spec = th.stft(audio, n_fft=self.fft_bins, hop_length=self.hop_length, win_length=self.win_length,
# window=hann, center=not self.causal, normalized=self.normalized)
# return spec.contiguous()
# def complex_spectrogram(self, audio):
# '''
# audio: wave signal as th.Tensor
# return: th.Tensor of size channels x frequencies x time_steps (channels x y_axis x x_axis)
# '''
# self._convert_format(audio, expected_dims=2)
# audio = self._preemphasis(audio)
# return self.stft(audio)
# def magnitude_phase(self, audio):
# '''
# audio: wave signal as th.Tensor
# return: tuple containing two th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum
# '''
# stft = self.complex_spectrogram(audio)
# return self._magphase(stft)
# def mag_spectrogram(self, audio):
# '''
# audio: wave signal as th.Tensor
# return: magnitude spectrum as th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum
# '''
# return self.magnitude_phase(audio)[0]
# def power_spectrogram(self, audio):
# '''
# audio: wave signal as th.Tensor
# return: power spectrum as th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum
# '''
# return th.pow(self.mag_spectrogram(audio), 2.0)
# def phase_spectrogram(self, audio):
# '''
# audio: wave signal as th.Tensor
# return: phase spectrum as th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum
# '''
# return self.magnitude_phase(audio)[1]
# def mel_spectrogram(self, audio, n_mels):
# '''
# audio: wave signal as th.Tensor
# n_mels: number of bins used for mel scale warping
# return: mel spectrogram as th.Tensor of size channels x n_mels x time_steps for magnitude and phase spectrum
# '''
# spec = self.power_spectrogram(audio)
# mel_warping = ta.transforms.MelScale(n_mels, self.sample_rate)
# return mel_warping(spec)
# def complex_spec2wav(self, complex_spec, length):
# '''
# inverse stft
# complex_spec: complex spectrum as th.Tensor of size channels x frequencies x time_steps x 2 (real part/imaginary part)
# length: length of the audio to be reconstructed (in frames)
# '''
# complex_spec = self._convert_format(complex_spec, expected_dims=4)
# hann = th.hann_window(self.win_length)
# hann = hann.cuda() if complex_spec.is_cuda else hann
# wav = ta.functional.istft(complex_spec, n_fft=self.fft_bins, hop_length=self.hop_length, win_length=self.win_length, window=hann, length=length, center=not self.causal)
# wav = self._revert_preemphasis(wav)
# return wav
# def magphase2wav(self, mag_spec, phase_spec, length):
# '''
# reconstruction of wav signal from magnitude and phase spectrum
# mag_spec: magnitude spectrum as th.Tensor of size channels x frequencies x time_steps
# phase_spec: phase spectrum as th.Tensor of size channels x frequencies x time_steps
# length: length of the audio to be reconstructed (in frames)
# '''
# mag_spec = self._convert_format(mag_spec, expected_dims=3)
# phase_spec = self._convert_format(phase_spec, expected_dims=3)
# complex_spec = th.stack([mag_spec * th.cos(phase_spec), mag_spec * th.sin(phase_spec)], dim=-1)
# return self.complex_spec2wav(complex_spec, length)


@@ -0,0 +1,113 @@
"""
Copyright (c) Facebook, Inc. and its affiliates.
All rights reserved.
This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
"""
import torch as th
import torch.nn as nn
import torch.nn.functional as F
class TimeWarperFunction(th.autograd.Function):
@staticmethod
def forward(ctx, input, warpfield):
'''
:param ctx: autograd context
:param input: input signal (B x 2 x T)
:param warpfield: the corresponding warpfield (B x 2 x T)
:return: the warped signal (B x 2 x T)
'''
ctx.save_for_backward(input, warpfield)
# compute index list to lookup warped input values
idx_left = warpfield.floor().type(th.long)
idx_right = th.clamp(warpfield.ceil().type(th.long), max=input.shape[-1]-1)
# compute weight for linear interpolation
alpha = warpfield - warpfield.floor()
# linear interpolation
output = (1 - alpha) * th.gather(input, 2, idx_left) + alpha * th.gather(input, 2, idx_right)
return output
@staticmethod
def backward(ctx, grad_output):
input, warpfield = ctx.saved_tensors
# compute index list to lookup warped input values
idx_left = warpfield.floor().type(th.long)
idx_right = th.clamp(warpfield.ceil().type(th.long), max=input.shape[-1]-1)
# warpfield gradient
grad_warpfield = th.gather(input, 2, idx_right) - th.gather(input, 2, idx_left)
grad_warpfield = grad_output * grad_warpfield
# input gradient
grad_input = th.zeros(input.shape, device=input.device)
alpha = warpfield - warpfield.floor()
grad_input = grad_input.scatter_add(2, idx_left, grad_output * (1 - alpha)) + \
grad_input.scatter_add(2, idx_right, grad_output * alpha)
return grad_input, grad_warpfield
class TimeWarper(nn.Module):
def __init__(self):
super().__init__()
self.warper = TimeWarperFunction().apply
def _to_absolute_positions(self, warpfield, seq_length):
# translate warpfield from relative warp indices to absolute indices ([1...T] + warpfield)
temp_range = th.arange(seq_length, dtype=th.float)
temp_range = temp_range.cuda() if warpfield.is_cuda else temp_range
return th.clamp(warpfield + temp_range[None, None, :], min=0, max=seq_length-1)
def forward(self, input, warpfield):
'''
:param input: audio signal to be warped (B x 2 x T)
:param warpfield: the corresponding warpfield (B x 2 x T)
:return: the warped signal (B x 2 x T)
'''
warpfield = self._to_absolute_positions(warpfield, input.shape[-1])
warped = self.warper(input, warpfield)
return warped
class MonotoneTimeWarper(TimeWarper):
def forward(self, input, warpfield):
'''
:param input: audio signal to be warped (B x 2 x T)
:param warpfield: the corresponding warpfield (B x 2 x T)
:return: the warped signal (B x 2 x T), ensured to be monotonous
'''
warpfield = self._to_absolute_positions(warpfield, input.shape[-1])
# ensure monotonicity: make the warp positions non-decreasing via a cumulative max
warpfield = th.cummax(warpfield, dim=-1)[0]
# warp
warped = self.warper(input, warpfield)
return warped
class GeometricTimeWarper(TimeWarper):
def __init__(self, sampling_rate=48000):
super().__init__()
self.sampling_rate = sampling_rate
def displacements2warpfield(self, displacements, seq_length):
distance = th.sum(displacements**2, dim=2) ** 0.5
distance = F.interpolate(distance, size=seq_length)
warpfield = -distance / 343.0 * self.sampling_rate
return warpfield
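# Illustrative numbers: at 48 kHz and c = 343 m/s, a source 1 m away gives a warp of
# -1 / 343 * 48000 ~= -140 samples, i.e. the signal arrives roughly 2.9 ms later.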
def forward(self, input, displacements):
'''
:param input: audio signal to be warped (B x 2 x T)
:param displacements: sequence of 3D displacement vectors for geometric warping (B x 3 x T)
:return: the warped signal (B x 2 x T)
'''
warpfield = self.displacements2warpfield(displacements, input.shape[-1])
warped = super().forward(input, warpfield)
return warped


@@ -0,0 +1,25 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from .text_encoder import Text_Encoder
from .resunet_film import UNetRes_FiLM
class LASSNet(nn.Module):
def __init__(self, device='cuda'):
super(LASSNet, self).__init__()
self.text_embedder = Text_Encoder(device)
self.UNet = UNetRes_FiLM(channels=1, cond_embedding_dim=256)
def forward(self, x, caption):
# x: (Batch, 1, T, 128)
input_ids, attns_mask = self.text_embedder.tokenize(caption)
cond_vec = self.text_embedder(input_ids, attns_mask)[0]
dec_cond_vec = cond_vec
mask = self.UNet(x, cond_vec, dec_cond_vec)
mask = torch.sigmoid(mask)
return mask
def get_tokenizer(self):
return self.text_embedder.tokenizer
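# Illustrative usage sketch (the spectrogram shape mirrors the UNetRes_FiLM self-test;
# the caption is a hypothetical example):
# model = LASSNet(device='cpu')
# spec = torch.randn(1, 1, 1001, 513) # (batch, 1, T, F)
# mask = model(spec, ['a dog barking']) # sigmoid mask with the same shape as spec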


@@ -0,0 +1,27 @@
import torch
import torch.nn as nn
class Film(nn.Module):
def __init__(self, channels, cond_embedding_dim):
super(Film, self).__init__()
self.linear = nn.Sequential(
nn.Linear(cond_embedding_dim, channels * 2),
nn.ReLU(inplace=True),
nn.Linear(channels * 2, channels),
nn.ReLU(inplace=True)
)
def forward(self, data, cond_vec):
"""
:param data: [batchsize, channels, samples] or [batchsize, channels, T, F] or [batchsize, channels, F, T]
:param cond_vec: [batchsize, cond_embedding_dim]
:return:
"""
bias = self.linear(cond_vec) # [batchsize, channels]
if len(list(data.size())) == 3:
data = data + bias[..., None]
elif len(list(data.size())) == 4:
data = data + bias[..., None, None]
else:
print("Warning: The size of input tensor,", data.size(), "is not correct. Film is not working.")
return data
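# Illustrative usage sketch: this FiLM variant adds a conditioning-dependent bias per
# channel (no multiplicative scaling).
# film = Film(channels=32, cond_embedding_dim=256)
# feats = torch.randn(4, 32, 100, 64) # (batch, channels, T, F)
# cond = torch.randn(4, 256) # (batch, cond_embedding_dim)
# out = film(feats, cond) # same shape as feats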


@@ -0,0 +1,483 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from .film import Film
class ConvBlock(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, activation, momentum):
super(ConvBlock, self).__init__()
self.activation = activation
padding = (kernel_size[0] // 2, kernel_size[1] // 2)
self.conv1 = nn.Conv2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=(1, 1),
dilation=(1, 1),
padding=padding,
bias=False,
)
self.bn1 = nn.BatchNorm2d(out_channels, momentum=momentum)
self.conv2 = nn.Conv2d(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=(1, 1),
dilation=(1, 1),
padding=padding,
bias=False,
)
self.bn2 = nn.BatchNorm2d(out_channels, momentum=momentum)
self.init_weights()
def init_weights(self):
init_layer(self.conv1)
init_layer(self.conv2)
init_bn(self.bn1)
init_bn(self.bn2)
def forward(self, x):
x = act(self.bn1(self.conv1(x)), self.activation)
x = act(self.bn2(self.conv2(x)), self.activation)
return x
class EncoderBlock(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, downsample, activation, momentum):
super(EncoderBlock, self).__init__()
self.conv_block = ConvBlock(
in_channels, out_channels, kernel_size, activation, momentum
)
self.downsample = downsample
def forward(self, x):
encoder = self.conv_block(x)
encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample)
return encoder_pool, encoder
class DecoderBlock(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, upsample, activation, momentum):
super(DecoderBlock, self).__init__()
self.kernel_size = kernel_size
self.stride = upsample
self.activation = activation
self.conv1 = torch.nn.ConvTranspose2d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=self.stride,
stride=self.stride,
padding=(0, 0),
bias=False,
dilation=(1, 1),
)
self.bn1 = nn.BatchNorm2d(out_channels, momentum=momentum)
self.conv_block2 = ConvBlock(
out_channels * 2, out_channels, kernel_size, activation, momentum
)
def init_weights(self):
init_layer(self.conv1)
init_bn(self.bn1)
def prune(self, x):
"""Prune the shape of x after transpose convolution."""
padding = (self.kernel_size[0] // 2, self.kernel_size[1] // 2)
x = x[
:,
:,
padding[0] : padding[0] - self.stride[0],
padding[1] : padding[1] - self.stride[1]]
return x
def forward(self, input_tensor, concat_tensor):
x = act(self.bn1(self.conv1(input_tensor)), self.activation)
# x = self.prune(x)
x = torch.cat((x, concat_tensor), dim=1)
x = self.conv_block2(x)
return x
class EncoderBlockRes1B(nn.Module):
def __init__(self, in_channels, out_channels, downsample, activation, momentum):
super(EncoderBlockRes1B, self).__init__()
size = (3,3)
self.conv_block1 = ConvBlockRes(in_channels, out_channels, size, activation, momentum)
self.conv_block2 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.conv_block3 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.conv_block4 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.downsample = downsample
def forward(self, x):
encoder = self.conv_block1(x)
encoder = self.conv_block2(encoder)
encoder = self.conv_block3(encoder)
encoder = self.conv_block4(encoder)
encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample)
return encoder_pool, encoder
class DecoderBlockRes1B(nn.Module):
def __init__(self, in_channels, out_channels, stride, activation, momentum):
super(DecoderBlockRes1B, self).__init__()
size = (3,3)
self.activation = activation
self.conv1 = torch.nn.ConvTranspose2d(in_channels=in_channels,
out_channels=out_channels, kernel_size=size, stride=stride,
padding=(0, 0), output_padding=(0, 0), bias=False, dilation=1)
self.bn1 = nn.BatchNorm2d(in_channels)
self.conv_block2 = ConvBlockRes(out_channels * 2, out_channels, size, activation, momentum)
self.conv_block3 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.conv_block4 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.conv_block5 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
def init_weights(self):
init_layer(self.conv1)
def prune(self, x, both=False):
"""Prune the shape of x after transpose convolution.
"""
if both:
x = x[:, :, 0:-1, 0:-1]
else:
x = x[:, :, 0:-1, :]
return x
def forward(self, input_tensor, concat_tensor,both=False):
x = self.conv1(F.relu_(self.bn1(input_tensor)))
x = self.prune(x,both=both)
x = torch.cat((x, concat_tensor), dim=1)
x = self.conv_block2(x)
x = self.conv_block3(x)
x = self.conv_block4(x)
x = self.conv_block5(x)
return x
class EncoderBlockRes2BCond(nn.Module):
def __init__(self, in_channels, out_channels, downsample, activation, momentum, cond_embedding_dim):
super(EncoderBlockRes2BCond, self).__init__()
size = (3, 3)
self.conv_block1 = ConvBlockResCond(in_channels, out_channels, size, activation, momentum, cond_embedding_dim)
self.conv_block2 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim)
self.downsample = downsample
def forward(self, x, cond_vec):
encoder = self.conv_block1(x, cond_vec)
encoder = self.conv_block2(encoder, cond_vec)
encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample)
return encoder_pool, encoder
class DecoderBlockRes2BCond(nn.Module):
def __init__(self, in_channels, out_channels, stride, activation, momentum, cond_embedding_dim):
super(DecoderBlockRes2BCond, self).__init__()
size = (3, 3)
self.activation = activation
self.conv1 = torch.nn.ConvTranspose2d(in_channels=in_channels,
out_channels=out_channels, kernel_size=size, stride=stride,
padding=(0, 0), output_padding=(0, 0), bias=False, dilation=1)
self.bn1 = nn.BatchNorm2d(in_channels)
self.conv_block2 = ConvBlockResCond(out_channels * 2, out_channels, size, activation, momentum, cond_embedding_dim)
self.conv_block3 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim)
def init_weights(self):
init_layer(self.conv1)
def prune(self, x, both=False):
"""Prune the shape of x after transpose convolution.
"""
if both:
x = x[:, :, 0:-1, 0:-1]
else:
x = x[:, :, 0:-1, :]
return x
def forward(self, input_tensor, concat_tensor, cond_vec, both=False):
x = self.conv1(F.relu_(self.bn1(input_tensor)))
x = self.prune(x, both=both)
x = torch.cat((x, concat_tensor), dim=1)
x = self.conv_block2(x, cond_vec)
x = self.conv_block3(x, cond_vec)
return x
class EncoderBlockRes4BCond(nn.Module):
def __init__(self, in_channels, out_channels, downsample, activation, momentum, cond_embedding_dim):
super(EncoderBlockRes4BCond, self).__init__()
size = (3,3)
self.conv_block1 = ConvBlockResCond(in_channels, out_channels, size, activation, momentum, cond_embedding_dim)
self.conv_block2 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim)
self.conv_block3 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim)
self.conv_block4 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim)
self.downsample = downsample
def forward(self, x, cond_vec):
encoder = self.conv_block1(x, cond_vec)
encoder = self.conv_block2(encoder, cond_vec)
encoder = self.conv_block3(encoder, cond_vec)
encoder = self.conv_block4(encoder, cond_vec)
encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample)
return encoder_pool, encoder
class DecoderBlockRes4BCond(nn.Module):
def __init__(self, in_channels, out_channels, stride, activation, momentum, cond_embedding_dim):
super(DecoderBlockRes4BCond, self).__init__()
size = (3, 3)
self.activation = activation
self.conv1 = torch.nn.ConvTranspose2d(in_channels=in_channels,
out_channels=out_channels, kernel_size=size, stride=stride,
padding=(0, 0), output_padding=(0, 0), bias=False, dilation=1)
self.bn1 = nn.BatchNorm2d(in_channels)
self.conv_block2 = ConvBlockResCond(out_channels * 2, out_channels, size, activation, momentum, cond_embedding_dim)
self.conv_block3 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim)
self.conv_block4 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim)
self.conv_block5 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim)
def init_weights(self):
init_layer(self.conv1)
def prune(self, x, both=False):
"""Prune the shape of x after transpose convolution.
"""
if both:
x = x[:, :, 0:-1, 0:-1]
else:
x = x[:, :, 0:-1, :]
return x
def forward(self, input_tensor, concat_tensor, cond_vec, both=False):
x = self.conv1(F.relu_(self.bn1(input_tensor)))
x = self.prune(x,both=both)
x = torch.cat((x, concat_tensor), dim=1)
x = self.conv_block2(x, cond_vec)
x = self.conv_block3(x, cond_vec)
x = self.conv_block4(x, cond_vec)
x = self.conv_block5(x, cond_vec)
return x
class EncoderBlockRes4B(nn.Module):
def __init__(self, in_channels, out_channels, downsample, activation, momentum):
super(EncoderBlockRes4B, self).__init__()
size = (3, 3)
self.conv_block1 = ConvBlockRes(in_channels, out_channels, size, activation, momentum)
self.conv_block2 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.conv_block3 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.conv_block4 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.downsample = downsample
def forward(self, x):
encoder = self.conv_block1(x)
encoder = self.conv_block2(encoder)
encoder = self.conv_block3(encoder)
encoder = self.conv_block4(encoder)
encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample)
return encoder_pool, encoder
class DecoderBlockRes4B(nn.Module):
def __init__(self, in_channels, out_channels, stride, activation, momentum):
super(DecoderBlockRes4B, self).__init__()
size = (3,3)
self.activation = activation
self.conv1 = torch.nn.ConvTranspose2d(in_channels=in_channels,
out_channels=out_channels, kernel_size=size, stride=stride,
padding=(0, 0), output_padding=(0, 0), bias=False, dilation=1)
self.bn1 = nn.BatchNorm2d(in_channels)
self.conv_block2 = ConvBlockRes(out_channels * 2, out_channels, size, activation, momentum)
self.conv_block3 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.conv_block4 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
self.conv_block5 = ConvBlockRes(out_channels, out_channels, size, activation, momentum)
def init_weights(self):
init_layer(self.conv1)
def prune(self, x, both=False):
"""Prune the shape of x after transpose convolution.
"""
if both:
x = x[:, :, 0:-1, 0:-1]
else:
x = x[:, :, 0:-1, :]
return x
def forward(self, input_tensor, concat_tensor,both=False):
x = self.conv1(F.relu_(self.bn1(input_tensor)))
x = self.prune(x,both=both)
x = torch.cat((x, concat_tensor), dim=1)
x = self.conv_block2(x)
x = self.conv_block3(x)
x = self.conv_block4(x)
x = self.conv_block5(x)
return x
class ConvBlockResCond(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, activation, momentum, cond_embedding_dim):
r"""Residual block.
"""
super(ConvBlockResCond, self).__init__()
self.activation = activation
padding = [kernel_size[0] // 2, kernel_size[1] // 2]
self.bn1 = nn.BatchNorm2d(in_channels)
self.bn2 = nn.BatchNorm2d(out_channels)
self.conv1 = nn.Conv2d(in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size, stride=(1, 1),
dilation=(1, 1), padding=padding, bias=False)
self.film1 = Film(channels=out_channels, cond_embedding_dim=cond_embedding_dim)
self.conv2 = nn.Conv2d(in_channels=out_channels,
out_channels=out_channels,
kernel_size=kernel_size, stride=(1, 1),
dilation=(1, 1), padding=padding, bias=False)
self.film2 = Film(channels=out_channels, cond_embedding_dim=cond_embedding_dim)
if in_channels != out_channels:
self.shortcut = nn.Conv2d(in_channels=in_channels,
out_channels=out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
self.film_res = Film(channels=out_channels, cond_embedding_dim=cond_embedding_dim)
self.is_shortcut = True
else:
self.is_shortcut = False
self.init_weights()
def init_weights(self):
init_bn(self.bn1)
init_bn(self.bn2)
init_layer(self.conv1)
init_layer(self.conv2)
if self.is_shortcut:
init_layer(self.shortcut)
def forward(self, x, cond_vec):
origin = x
x = self.conv1(F.leaky_relu_(self.bn1(x), negative_slope=0.01))
x = self.film1(x, cond_vec)
x = self.conv2(F.leaky_relu_(self.bn2(x), negative_slope=0.01))
x = self.film2(x, cond_vec)
if self.is_shortcut:
residual = self.shortcut(origin)
residual = self.film_res(residual, cond_vec)
return residual + x
else:
return origin + x
class ConvBlockRes(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, activation, momentum):
r"""Residual block.
"""
super(ConvBlockRes, self).__init__()
self.activation = activation
padding = [kernel_size[0] // 2, kernel_size[1] // 2]
self.bn1 = nn.BatchNorm2d(in_channels)
self.bn2 = nn.BatchNorm2d(out_channels)
self.conv1 = nn.Conv2d(in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size, stride=(1, 1),
dilation=(1, 1), padding=padding, bias=False)
self.conv2 = nn.Conv2d(in_channels=out_channels,
out_channels=out_channels,
kernel_size=kernel_size, stride=(1, 1),
dilation=(1, 1), padding=padding, bias=False)
if in_channels != out_channels:
self.shortcut = nn.Conv2d(in_channels=in_channels,
out_channels=out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0))
self.is_shortcut = True
else:
self.is_shortcut = False
self.init_weights()
def init_weights(self):
init_bn(self.bn1)
init_bn(self.bn2)
init_layer(self.conv1)
init_layer(self.conv2)
if self.is_shortcut:
init_layer(self.shortcut)
def forward(self, x):
origin = x
x = self.conv1(F.leaky_relu_(self.bn1(x), negative_slope=0.01))
x = self.conv2(F.leaky_relu_(self.bn2(x), negative_slope=0.01))
if self.is_shortcut:
return self.shortcut(origin) + x
else:
return origin + x
def init_layer(layer):
"""Initialize a Linear or Convolutional layer. """
nn.init.xavier_uniform_(layer.weight)
if hasattr(layer, 'bias'):
if layer.bias is not None:
layer.bias.data.fill_(0.)
def init_bn(bn):
"""Initialize a Batchnorm layer. """
bn.bias.data.fill_(0.)
bn.weight.data.fill_(1.)
def init_gru(rnn):
"""Initialize a GRU layer. """
def _concat_init(tensor, init_funcs):
(length, fan_out) = tensor.shape
fan_in = length // len(init_funcs)
for (i, init_func) in enumerate(init_funcs):
init_func(tensor[i * fan_in: (i + 1) * fan_in, :])
def _inner_uniform(tensor):
fan_in = nn.init._calculate_correct_fan(tensor, 'fan_in')
nn.init.uniform_(tensor, -math.sqrt(3 / fan_in), math.sqrt(3 / fan_in))
for i in range(rnn.num_layers):
_concat_init(
getattr(rnn, 'weight_ih_l{}'.format(i)),
[_inner_uniform, _inner_uniform, _inner_uniform]
)
torch.nn.init.constant_(getattr(rnn, 'bias_ih_l{}'.format(i)), 0)
_concat_init(
getattr(rnn, 'weight_hh_l{}'.format(i)),
[_inner_uniform, _inner_uniform, nn.init.orthogonal_]
)
torch.nn.init.constant_(getattr(rnn, 'bias_hh_l{}'.format(i)), 0)
def act(x, activation):
if activation == 'relu':
return F.relu_(x)
elif activation == 'leaky_relu':
return F.leaky_relu_(x, negative_slope=0.2)
elif activation == 'swish':
return x * torch.sigmoid(x)
else:
raise Exception('Incorrect activation!')

View File

@@ -0,0 +1,110 @@
from .modules import *
import numpy as np
class UNetRes_FiLM(nn.Module):
def __init__(self, channels, cond_embedding_dim, nsrc=1):
super(UNetRes_FiLM, self).__init__()
activation = 'relu'
momentum = 0.01
self.nsrc = nsrc
self.channels = channels
self.downsample_ratio = 2 ** 6 # This number equals 2^{#encoder_blocks}
self.encoder_block1 = EncoderBlockRes2BCond(in_channels=channels * nsrc, out_channels=32,
downsample=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.encoder_block2 = EncoderBlockRes2BCond(in_channels=32, out_channels=64,
downsample=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.encoder_block3 = EncoderBlockRes2BCond(in_channels=64, out_channels=128,
downsample=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.encoder_block4 = EncoderBlockRes2BCond(in_channels=128, out_channels=256,
downsample=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.encoder_block5 = EncoderBlockRes2BCond(in_channels=256, out_channels=384,
downsample=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.encoder_block6 = EncoderBlockRes2BCond(in_channels=384, out_channels=384,
downsample=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.conv_block7 = ConvBlockResCond(in_channels=384, out_channels=384,
kernel_size=(3, 3), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.decoder_block1 = DecoderBlockRes2BCond(in_channels=384, out_channels=384,
stride=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.decoder_block2 = DecoderBlockRes2BCond(in_channels=384, out_channels=384,
stride=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.decoder_block3 = DecoderBlockRes2BCond(in_channels=384, out_channels=256,
stride=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.decoder_block4 = DecoderBlockRes2BCond(in_channels=256, out_channels=128,
stride=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.decoder_block5 = DecoderBlockRes2BCond(in_channels=128, out_channels=64,
stride=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.decoder_block6 = DecoderBlockRes2BCond(in_channels=64, out_channels=32,
stride=(2, 2), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.after_conv_block1 = ConvBlockResCond(in_channels=32, out_channels=32,
kernel_size=(3, 3), activation=activation, momentum=momentum,
cond_embedding_dim=cond_embedding_dim)
self.after_conv2 = nn.Conv2d(in_channels=32, out_channels=1,
kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=True)
self.init_weights()
def init_weights(self):
init_layer(self.after_conv2)
def forward(self, sp, cond_vec, dec_cond_vec):
"""
Args:
input: sp: (batch_size, channels_num, segment_samples)
Outputs:
output_dict: {
'wav': (batch_size, channels_num, segment_samples),
'sp': (batch_size, channels_num, time_steps, freq_bins)}
"""
x = sp
# Pad spectrogram to be evenly divided by downsample ratio.
origin_len = x.shape[2] # time_steps
pad_len = int(np.ceil(x.shape[2] / self.downsample_ratio)) * self.downsample_ratio - origin_len
x = F.pad(x, pad=(0, 0, 0, pad_len))
x = x[..., 0: x.shape[-1] - 2] # (bs, channels, T, F)
# UNet
(x1_pool, x1) = self.encoder_block1(x, cond_vec) # x1_pool: (bs, 32, T / 2, F / 2)
(x2_pool, x2) = self.encoder_block2(x1_pool, cond_vec) # x2_pool: (bs, 64, T / 4, F / 4)
(x3_pool, x3) = self.encoder_block3(x2_pool, cond_vec) # x3_pool: (bs, 128, T / 8, F / 8)
(x4_pool, x4) = self.encoder_block4(x3_pool, dec_cond_vec) # x4_pool: (bs, 256, T / 16, F / 16)
(x5_pool, x5) = self.encoder_block5(x4_pool, dec_cond_vec) # x5_pool: (bs, 384, T / 32, F / 32)
(x6_pool, x6) = self.encoder_block6(x5_pool, dec_cond_vec) # x6_pool: (bs, 384, T / 64, F / 64)
x_center = self.conv_block7(x6_pool, dec_cond_vec) # (bs, 384, T / 64, F / 64)
x7 = self.decoder_block1(x_center, x6, dec_cond_vec) # (bs, 384, T / 32, F / 32)
x8 = self.decoder_block2(x7, x5, dec_cond_vec) # (bs, 384, T / 16, F / 16)
x9 = self.decoder_block3(x8, x4, cond_vec) # (bs, 256, T / 8, F / 8)
x10 = self.decoder_block4(x9, x3, cond_vec) # (bs, 128, T / 4, F / 4)
x11 = self.decoder_block5(x10, x2, cond_vec) # (bs, 64, T / 2, F / 2)
x12 = self.decoder_block6(x11, x1, cond_vec) # (bs, 32, T, F)
x = self.after_conv_block1(x12, cond_vec) # (bs, 32, T, F)
x = self.after_conv2(x) # (bs, channels, T, F)
# Recover shape
x = F.pad(x, pad=(0, 2))
x = x[:, :, 0: origin_len, :]
return x
if __name__ == "__main__":
model = UNetRes_FiLM(channels=1, cond_embedding_dim=16)
cond_vec = torch.randn((1, 16))
dec_vec = cond_vec
print(model(torch.randn((1, 1, 1001, 513)), cond_vec, dec_vec).size())


@@ -0,0 +1,45 @@
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import warnings
warnings.filterwarnings('ignore')
# pretrained model name: (model class, tokenizer class)
MODELS = {
'prajjwal1/bert-mini': (BertModel, BertTokenizer),
}
class Text_Encoder(nn.Module):
def __init__(self, device):
super(Text_Encoder, self).__init__()
self.base_model = 'prajjwal1/bert-mini'
self.dropout = 0.1
self.tokenizer = MODELS[self.base_model][1].from_pretrained(self.base_model)
self.bert_layer = MODELS[self.base_model][0].from_pretrained(self.base_model,
add_pooling_layer=False,
hidden_dropout_prob=self.dropout,
attention_probs_dropout_prob=self.dropout,
output_hidden_states=True)
self.linear_layer = nn.Sequential(nn.Linear(256, 256), nn.ReLU(inplace=True))
self.device = device
def tokenize(self, caption):
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenized = self.tokenizer(caption, add_special_tokens=False, padding=True, return_tensors='pt')
input_ids = tokenized['input_ids']
attns_mask = tokenized['attention_mask']
input_ids = input_ids.to(self.device)
attns_mask = attns_mask.to(self.device)
return input_ids, attns_mask
def forward(self, input_ids, attns_mask):
# input_ids, attns_mask = self.tokenize(caption)
output = self.bert_layer(input_ids=input_ids, attention_mask=attns_mask)[0]
cls_embed = output[:, 0, :]
text_embed = self.linear_layer(cls_embed)
return text_embed, output # text_embed: (batch, hidden_size)
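# Illustrative usage sketch (downloads the prajjwal1/bert-mini weights on first use):
# encoder = Text_Encoder(device='cpu')
# ids, mask = encoder.tokenize(['a dog barking', 'rain on a window'])
# text_embed, hidden = encoder(ids, mask) # text_embed: (batch, 256)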


@@ -0,0 +1,98 @@
import torch
import numpy as np
def add_noise_and_scale(front, noise, snr_l=0, snr_h=0, scale_lower=1.0, scale_upper=1.0):
"""
:param front: foreground audio, e.g. vocals, [samples, channel]; it will be normalized so any input scale is fine
:param noise: noise, [samples,channel], any scale
:param snr_l: Optional
:param snr_h: Optional
:param scale_lower: Optional
:param scale_upper: Optional
:return: scaled front, noise, their mixture, the snr and the scale (noisy = front + noise); all signals are normalized within [-1, 1]
"""
snr = None
noise, front = normalize_energy_torch(noise), normalize_energy_torch(front) # set noise and vocal to equal range [-1,1]
# print("normalize:",torch.max(noise),torch.max(front))
if snr_l is not None and snr_h is not None:
front, noise, snr = _random_noise(front, noise, snr_l=snr_l, snr_h=snr_h) # remix them with a specific snr
noisy, noise, front = unify_energy_torch(noise + front, noise, front) # normalize noisy, noise and vocal energy into [-1,1]
# print("unify:", torch.max(noise), torch.max(front), torch.max(noisy))
scale = _random_scale(scale_lower, scale_upper) # random scale these three signal
# print("Scale",scale)
noisy, noise, front = noisy * scale, noise * scale, front * scale # apply scale
# print("after scale", torch.max(noisy), torch.max(noise), torch.max(front), snr, scale)
front, noise = _to_numpy(front), _to_numpy(noise) # [num_samples]
mixed_wav = front + noise
return front, noise, mixed_wav, snr, scale
def _random_scale(lower=0.3, upper=0.9):
return float(uniform_torch(lower, upper))
def _random_noise(clean, noise, snr_l=None, snr_h=None):
snr = uniform_torch(snr_l,snr_h)
clean_weight = 10 ** (float(snr) / 20)
return clean, noise/clean_weight, snr
def _to_numpy(wav):
return wav.transpose(0, 1)[0].numpy() # [samples, channel] -> [num_samples]
def normalize_energy(audio, alpha = 1):
'''
:param audio: 1d waveform, [batchsize, *]
:param alpha: the output will range within [-alpha, alpha]
:return: 1d waveform whose values range within [-alpha, alpha]
'''
val_max = activelev(audio)
return (audio / val_max) * alpha
def normalize_energy_torch(audio, alpha = 1):
'''
Torch version of normalize_energy.
:param audio: 1d waveform, [batchsize, *]
:param alpha: the output will range within [-alpha, alpha]
:return: 1d waveform whose values range within [-alpha, alpha]
'''
val_max = activelev_torch([audio])
return (audio / val_max) * alpha
def unify_energy(*args):
max_amp = activelev(args)
mix_scale = 1.0/max_amp
return [x * mix_scale for x in args]
def unify_energy_torch(*args):
max_amp = activelev_torch(args)
mix_scale = 1.0/max_amp
return [x * mix_scale for x in args]
def activelev(*args):
'''
peak absolute amplitude (TODO: match MATLAB's active-level measurement)
'''
return np.max(np.abs([*args]))
def activelev_torch(*args):
'''
peak absolute amplitude over a list of tensors (TODO: match MATLAB's active-level measurement)
'''
res = []
args = args[0]
for each in args:
res.append(torch.max(torch.abs(each)))
return max(res)
def uniform_torch(lower, upper):
if(abs(lower-upper)<1e-5):
return upper
return (upper-lower)*torch.rand(1)+lower
if __name__ == "__main__":
wav1 = torch.randn(1, 32000)
wav2 = torch.randn(1, 32000)
target, noise, snr, scale = add_noise_and_scale(wav1, wav2)
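# Added check (not part of the original file): with a pinned SNR range the
# clean/noise peak ratio should reflect the requested value, since
# _random_noise attenuates the noise by 10 ** (snr / 20).
front, noise, mixed_wav, snr, scale = add_noise_and_scale(torch.randn(32000, 1), torch.randn(32000, 1), snr_l=5, snr_h=5)
print(snr) # 5
print(20 * np.log10(np.max(np.abs(front)) / np.max(np.abs(noise)))) # ~5 dB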

View File

@@ -0,0 +1,159 @@
import torch
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable
from scipy.signal import get_window
import librosa.util as librosa_util
from librosa.util import pad_center, tiny
# from audio_processing import window_sumsquare
def window_sumsquare(window, n_frames, hop_length=512, win_length=1024,
n_fft=1024, dtype=np.float32, norm=None):
"""
# from librosa 0.6
Compute the sum-square envelope of a window function at a given hop length.
This is used to estimate modulation effects induced by windowing
observations in short-time Fourier transforms.
Parameters
----------
window : string, tuple, number, callable, or list-like
Window specification, as in `get_window`
n_frames : int > 0
The number of analysis frames
hop_length : int > 0
The number of samples to advance between frames
win_length : [optional]
The length of the window function. By default, this matches `n_fft`.
n_fft : int > 0
The length of each analysis frame.
dtype : np.dtype
The data type of the output
Returns
-------
wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
The sum-squared envelope of the window function
"""
if win_length is None:
win_length = n_fft
n = n_fft + hop_length * (n_frames - 1)
x = np.zeros(n, dtype=dtype)
# Compute the squared window at the desired length
win_sq = get_window(window, win_length, fftbins=True)
win_sq = librosa_util.normalize(win_sq, norm=norm)**2
win_sq = librosa_util.pad_center(win_sq, n_fft)
# Fill the envelope
for i in range(n_frames):
sample = i * hop_length
x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
return x
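# Added note (not part of the original file): this envelope is what
# STFT.inverse divides out to undo windowing modulation. As an illustration,
# for a hann window at 75% overlap (hop = win/4) the interior of the envelope
# is exactly the constant 1.5; the helper name below is ours, not the repo's.
def _demo_window_sumsquare():
wss = window_sumsquare('hann', n_frames=40, hop_length=256, win_length=1024, n_fft=1024)
print(wss[2048:-2048].min(), wss[2048:-2048].max()) # both ~1.5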
class STFT(torch.nn.Module):
"""adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
def __init__(self, filter_length=1024, hop_length=512, win_length=1024,
window='hann'):
super(STFT, self).__init__()
self.filter_length = filter_length
self.hop_length = hop_length
self.win_length = win_length
self.window = window
self.forward_transform = None
scale = self.filter_length / self.hop_length
fourier_basis = np.fft.fft(np.eye(self.filter_length))
cutoff = int((self.filter_length / 2 + 1))
fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
np.imag(fourier_basis[:cutoff, :])])
forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
inverse_basis = torch.FloatTensor(
np.linalg.pinv(scale * fourier_basis).T[:, None, :])
if window is not None:
assert(filter_length >= win_length)
# get window and zero center pad it to filter_length
fft_window = get_window(window, win_length, fftbins=True)
fft_window = pad_center(fft_window, filter_length)
fft_window = torch.from_numpy(fft_window).float()
# window the bases
forward_basis *= fft_window
inverse_basis *= fft_window
self.register_buffer('forward_basis', forward_basis.float())
self.register_buffer('inverse_basis', inverse_basis.float())
def transform(self, input_data):
num_batches = input_data.size(0)
num_samples = input_data.size(1)
self.num_samples = num_samples
# similar to librosa, reflect-pad the input
input_data = input_data.view(num_batches, 1, num_samples)
input_data = F.pad(
input_data.unsqueeze(1),
(int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
mode='reflect')
input_data = input_data.squeeze(1)
forward_transform = F.conv1d(
input_data,
Variable(self.forward_basis, requires_grad=False),
stride=self.hop_length,
padding=0)
cutoff = int((self.filter_length / 2) + 1)
real_part = forward_transform[:, :cutoff, :]
imag_part = forward_transform[:, cutoff:, :]
magnitude = torch.sqrt(real_part**2 + imag_part**2)
phase = torch.autograd.Variable(
torch.atan2(imag_part.data, real_part.data))
return magnitude, phase # [batch_size, F(513), T(1251)]
def inverse(self, magnitude, phase):
recombine_magnitude_phase = torch.cat(
[magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
inverse_transform = F.conv_transpose1d(
recombine_magnitude_phase,
Variable(self.inverse_basis, requires_grad=False),
stride=self.hop_length,
padding=0)
if self.window is not None:
window_sum = window_sumsquare(
self.window, magnitude.size(-1), hop_length=self.hop_length,
win_length=self.win_length, n_fft=self.filter_length,
dtype=np.float32)
# remove modulation effects
approx_nonzero_indices = torch.from_numpy(
np.where(window_sum > tiny(window_sum))[0])
window_sum = torch.autograd.Variable(
torch.from_numpy(window_sum), requires_grad=False)
window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
# scale by hop ratio
inverse_transform *= float(self.filter_length) / self.hop_length
inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
return inverse_transform #[batch_size, 1, sample_num]
def forward(self, input_data):
self.magnitude, self.phase = self.transform(input_data)
reconstruction = self.inverse(self.magnitude, self.phase)
return reconstruction
if __name__ == '__main__':
a = torch.randn(4, 320000)
stft = STFT()
mag, phase = stft.transform(a)
# rec_a = stft.inverse(mag, phase)
print(mag.shape)
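# Added round-trip check (not part of the original file): forward() chains
# transform() and inverse(); inverse() trims the reflect padding again, so the
# reconstruction should match the input length and be close in value.
rec_a = stft.forward(a)
print(rec_a.shape, (rec_a.squeeze(1) - a).abs().max())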

View File

@@ -0,0 +1,23 @@
import librosa
import librosa.filters
import math
import numpy as np
import scipy.io.wavfile
def load_wav(path):
max_length = 32000 * 10 # 10 s at 32 kHz, the AudioCaps convention
wav = librosa.core.load(path, sr=32000)[0]
# truncate audio longer than max_length
if len(wav) > max_length:
wav = wav[0:max_length]
# pad audio shorter than max_length
if len(wav) < max_length:
wav = np.pad(wav, (0, max_length - len(wav)), 'constant')
wav = wav[..., None] # [samples, 1]
return wav
def save_wav(wav, path):
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
scipy.io.wavfile.write(path, 32000, wav.astype(np.int16))
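# Usage sketch (added, not part of the original file): round-trip a clip through
# the fixed 10 s / 32 kHz convention above; 'input.wav' is a placeholder path.
if __name__ == '__main__':
wav = load_wav('input.wav') # float array, shape (320000, 1)
save_wav(wav[:, 0], 'input_10s.wav') # written as 16-bit PCM at 32 kHz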