diff --git a/audio-chatgpt.py b/audio-chatgpt.py
index 609ab0f..1314a5e 100644
--- a/audio-chatgpt.py
+++ b/audio-chatgpt.py
@@ -6,6 +6,8 @@ sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'Neura
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint'))
+sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
+sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
 import gradio as gr
 import matplotlib
 import librosa
@@ -37,7 +39,18 @@ from inference.tts.PortaSpeech import TTSInference
 from utils.hparams import set_hparams
 from utils.hparams import hparams as hp
 import scipy.io.wavfile as wavfile
-
+import librosa
+from audio_infer.utils import config as detection_config
+from audio_infer.pytorch.models import PVT
+from src.models import BinauralNetwork
+from sound_extraction.model.LASSNet import LASSNet
+from sound_extraction.utils.stft import STFT
+from sound_extraction.utils.wav_io import load_wav, save_wav
+from target_sound_detection.src import models as tsd_models
+from target_sound_detection.src.models import event_labels
+from target_sound_detection.src.utils import median_filter, decode_with_timestamps
+import clip
+import numpy as np
 AUDIO_CHATGPT_PREFIX = """Audio ChatGPT
 Audio ChatGPT cannot directly read audios, but it has a list of tools to finish different audio synthesis tasks. Each audio will have a file name formed as "audio/xxx.wav". When talking about audios, Audio ChatGPT is very strict with the file name and will never fabricate nonexistent files.
 Audio ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
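# Reviewer sketch (not part of the patch): every capability this diff adds follows the
# same plug-in contract so it can be registered as a LangChain Tool further down --
# __init__ takes a device string and loads the checkpoints, and inference() maps one
# string to one string. The class name and values below are hypothetical, for illustration only.
class ExampleAudioTool:
    def __init__(self, device):
        self.device = device  # e.g. "cuda:0"; model weights would be loaded here

    def inference(self, inputs: str) -> str:
        # Tools that need two arguments receive a single comma-separated string,
        # e.g. "audio/xxx.wav, a dog barking", and split it themselves.
        audio_path, text = [s.strip() for s in inputs.split(",", 1)]
        return f"processed {audio_path} with query '{text}'"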
@@ -511,6 +524,261 @@ class A2T:
         caption_text = self.model(audio)
         return caption_text[0]
 
+class SoundDetection:
+    def __init__(self, device):
+        self.device = device
+        self.sample_rate = 32000
+        self.window_size = 1024
+        self.hop_size = 320
+        self.mel_bins = 64
+        self.fmin = 50
+        self.fmax = 14000
+        self.model_type = 'PVT'
+        self.checkpoint_path = 'audio_detection/audio_infer/useful_ckpts/audio_detection.pth'
+        self.classes_num = detection_config.classes_num
+        self.labels = detection_config.labels
+        self.frames_per_second = self.sample_rate // self.hop_size
+        # Model = eval(self.model_type)
+        self.model = PVT(sample_rate=self.sample_rate, window_size=self.window_size,
+            hop_size=self.hop_size, mel_bins=self.mel_bins, fmin=self.fmin, fmax=self.fmax,
+            classes_num=self.classes_num)
+        checkpoint = torch.load(self.checkpoint_path, map_location=self.device)
+        self.model.load_state_dict(checkpoint['model'])
+        self.model.to(device)
+
+    def inference(self, audio_path):
+        # Forward
+        (waveform, _) = librosa.core.load(audio_path, sr=self.sample_rate, mono=True)
+        waveform = waveform[None, :]    # (1, audio_length)
+        waveform = torch.from_numpy(waveform)
+        waveform = waveform.to(self.device)
+        # Forward
+        with torch.no_grad():
+            self.model.eval()
+            batch_output_dict = self.model(waveform, None)
+        framewise_output = batch_output_dict['framewise_output'].data.cpu().numpy()[0]
+        """(time_steps, classes_num)"""
+        # print('Sound event detection result (time_steps x classes_num): {}'.format(
+        #     framewise_output.shape))
+        import numpy as np
+        import matplotlib.pyplot as plt
+        sorted_indexes = np.argsort(np.max(framewise_output, axis=0))[::-1]
+        top_k = 10  # Show top results
+        top_result_mat = framewise_output[:, sorted_indexes[0 : top_k]]
+        """(time_steps, top_k)"""
+        # Plot result
+        stft = librosa.core.stft(y=waveform[0].data.cpu().numpy(), n_fft=self.window_size,
+            hop_length=self.hop_size, window='hann', center=True)
+        frames_num = stft.shape[-1]
+        fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 4))
+        axs[0].matshow(np.log(np.abs(stft)), origin='lower', aspect='auto', cmap='jet')
+        axs[0].set_ylabel('Frequency bins')
+        axs[0].set_title('Log spectrogram')
+        axs[1].matshow(top_result_mat.T, origin='upper', aspect='auto', cmap='jet', vmin=0, vmax=1)
+        axs[1].xaxis.set_ticks(np.arange(0, frames_num, self.frames_per_second))
+        axs[1].xaxis.set_ticklabels(np.arange(0, frames_num / self.frames_per_second))
+        axs[1].yaxis.set_ticks(np.arange(0, top_k))
+        axs[1].yaxis.set_ticklabels(np.array(self.labels)[sorted_indexes[0 : top_k]])
+        axs[1].yaxis.grid(color='k', linestyle='solid', linewidth=0.3, alpha=0.3)
+        axs[1].set_xlabel('Seconds')
+        axs[1].xaxis.set_ticks_position('bottom')
+        plt.tight_layout()
+        image_filename = os.path.join('image', str(uuid.uuid4())[0:8] + ".png")
+        plt.savefig(image_filename)
+        return image_filename
+
+class SoundExtraction:
+    def __init__(self, device):
+        self.device = device
+        self.model_file = 'sound_extraction/useful_ckpts/LASSNet.pt'
+        self.stft = STFT()
+        import torch.nn as nn
+        self.model = nn.DataParallel(LASSNet(device)).to(device)
+        checkpoint = torch.load(self.model_file)
+        self.model.load_state_dict(checkpoint['model'])
+        self.model.eval()
+
+    def inference(self, inputs):
+        #key = ['ref_audio', 'text']
+        val = inputs.split(",")
+        audio_path = val[0]  # audio_path, text
+        text = val[1]
+        waveform = load_wav(audio_path)
+        waveform = torch.tensor(waveform).transpose(1,0)
+        mixed_mag, mixed_phase = self.stft.transform(waveform)
+        text_query = ['[CLS] ' + text]
+        mixed_mag = mixed_mag.transpose(2,1).unsqueeze(0).to(self.device)
+        est_mask = self.model(mixed_mag, text_query)
+        est_mag = est_mask * mixed_mag
+        est_mag = est_mag.squeeze(1)
+        est_mag = est_mag.permute(0, 2, 1)
+        est_wav = self.stft.inverse(est_mag.cpu().detach(), mixed_phase)
+        est_wav = est_wav.squeeze(0).squeeze(0).numpy()
+        #est_path = f'output/est{i}.wav'
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        print('audio_filename ', audio_filename)
+        save_wav(est_wav, audio_filename)
+        return audio_filename
+
+
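# Reviewer sketch (not part of the patch): how the extraction tool is expected to be
# called once it is registered as a Tool below. The file name and query are invented;
# inference() splits its single string input on ',' into the mixture path and the text query.
if __name__ == "__main__":
    extractor = SoundExtraction(device="cuda:0")
    result = extractor.inference("audio/1e6b8a2f.wav, a man speaking over heavy rain")
    print(result)  # path of the separated target source, e.g. "audio/3fa2c9d1.wav"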
+class Binaural:
+    def __init__(self, device):
+        self.device = device
+        self.model_file = 'mono2binaural/useful_ckpts/m2b/binaural_network.net'
+        self.position_file = ['mono2binaural/useful_ckpts/m2b/tx_positions.txt',
+                              'mono2binaural/useful_ckpts/m2b/tx_positions2.txt',
+                              'mono2binaural/useful_ckpts/m2b/tx_positions3.txt',
+                              'mono2binaural/useful_ckpts/m2b/tx_positions4.txt',
+                              'mono2binaural/useful_ckpts/m2b/tx_positions5.txt']
+        self.net = BinauralNetwork(view_dim=7,
+                                   warpnet_layers=4,
+                                   warpnet_channels=64,
+                                   )
+        self.net.load_from_file(self.model_file)
+        self.sr = 48000
+    def inference(self, audio_path):
+        mono, sr = librosa.load(path=audio_path, sr=self.sr, mono=True)
+        mono = torch.from_numpy(mono)
+        mono = mono.unsqueeze(0)
+        import numpy as np
+        import random
+        rand_int = random.randint(0,4)
+        view = np.loadtxt(self.position_file[rand_int]).transpose().astype(np.float32)
+        view = torch.from_numpy(view)
+        if not view.shape[-1] * 400 == mono.shape[-1]:
+            mono = mono[:,:(mono.shape[-1]//400)*400]  #
+            if view.shape[1]*400 > mono.shape[1]:
+                m_a = view.shape[1] - mono.shape[-1]//400
+                rand_st = random.randint(0,m_a)
+                view = view[:,m_a:m_a+(mono.shape[-1]//400)]  #
+        # binauralize and save output
+        self.net.eval().to(self.device)
+        mono, view = mono.to(self.device), view.to(self.device)
+        chunk_size = 48000  # forward in chunks of 1s
+        rec_field = 1000    # add 1000 samples as "safe bet" since warping has undefined rec. field
+        rec_field -= rec_field % 400  # make sure rec_field is a multiple of 400 to match audio and view frequencies
+        chunks = [
+            {
+                "mono": mono[:, max(0, i-rec_field):i+chunk_size],
+                "view": view[:, max(0, i-rec_field)//400:(i+chunk_size)//400]
+            }
+            for i in range(0, mono.shape[-1], chunk_size)
+        ]
+        for i, chunk in enumerate(chunks):
+            with torch.no_grad():
+                mono = chunk["mono"].unsqueeze(0)
+                view = chunk["view"].unsqueeze(0)
+                binaural = self.net(mono, view).squeeze(0)
+                if i > 0:
+                    binaural = binaural[:, -(mono.shape[-1]-rec_field):]
+                chunk["binaural"] = binaural
+        binaural = torch.cat([chunk["binaural"] for chunk in chunks], dim=-1)
+        binaural = torch.clamp(binaural, min=-1, max=1).cpu()
+        #binaural = chunked_forwarding(net, mono, view)
+        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
+        import torchaudio
+        torchaudio.save(audio_filename, binaural, sr)
+        #soundfile.write(audio_filename, binaural, samplerate = 48000)
+        print(f"Processed Binaural.run, audio_filename: {audio_filename}")
+        return audio_filename
+
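# Reviewer sketch (not part of the patch): a worked example of the chunking arithmetic
# used in Binaural.inference above. Each 1 s chunk (48000 samples) is prefixed with
# rec_field extra samples of past context so the warp network has enough history, and
# that prefix is trimmed from every chunk but the first before concatenation. The view
# positions run at 1/400 of the audio rate, hence the // 400 indexing.
chunk_size = 48000                     # 1 second of 48 kHz audio
rec_field = 1000 - (1000 % 400)        # -> 800, rounded down to a multiple of 400
for i in range(0, 144000, chunk_size):  # assume a 3 s mono signal for illustration
    a_lo, a_hi = max(0, i - rec_field), i + chunk_size
    v_lo, v_hi = max(0, i - rec_field) // 400, (i + chunk_size) // 400
    print(f"chunk {i // chunk_size}: audio[{a_lo}:{a_hi}]  view[{v_lo}:{v_hi}]")
# chunk 0: audio[0:48000]       view[0:120]
# chunk 1: audio[47200:96000]   view[118:240]
# chunk 2: audio[95200:144000]  view[238:360]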
+class TargetSoundDetection:
+    def __init__(self, device):
+        self.device = device
+        self.MEL_ARGS = {
+            'n_mels': 64,
+            'n_fft': 2048,
+            'hop_length': int(22050 * 20 / 1000),
+            'win_length': int(22050 * 40 / 1000)
+        }
+        self.EPS = np.spacing(1)
+        self.clip_model, _ = clip.load("ViT-B/32", device=self.device)
+        self.event_labels = event_labels
+        self.id_to_event = {i : label for i, label in enumerate(self.event_labels)}
+        config = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/run_config.pth', map_location='cpu')
+        config_parameters = dict(config)
+        config_parameters['tao'] = 0.6
+        if 'thres' not in config_parameters.keys():
+            config_parameters['thres'] = 0.5
+        if 'time_resolution' not in config_parameters.keys():
+            config_parameters['time_resolution'] = 125
+        model_parameters = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/run_model_7_loss=-0.0724.pt'
+                                      , map_location=lambda storage, loc: storage)  # load parameter
+        self.model = getattr(tsd_models, config_parameters['model'])(config_parameters,
+            inputdim=64, outputdim=2, time_resolution=config_parameters['time_resolution'], **config_parameters['model_args'])
+        self.model.load_state_dict(model_parameters)
+        self.model = self.model.to(self.device).eval()
+        self.re_embeds = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/text_emb.pth')
+        self.ref_mel = torch.load('audio_detection/target_sound_detection/useful_ckpts/tsd/ref_mel.pth')
+
+    def extract_feature(self, fname):
+        import soundfile as sf
+        y, sr = sf.read(fname, dtype='float32')
+        print('y ', y.shape)
+        ti = y.shape[0]/sr
+        if y.ndim > 1:
+            y = y.mean(1)
+        y = librosa.resample(y, sr, 22050)
+        lms_feature = np.log(librosa.feature.melspectrogram(y, **self.MEL_ARGS) + self.EPS).T
+        return lms_feature,ti
+
+    def build_clip(self, text):
+        text = clip.tokenize(text).to(self.device)  # ["a diagram with dog", "a dog", "a cat"]
+        text_features = self.clip_model.encode_text(text)
+        return text_features
+
+    def cal_similarity(self, target, retrievals):
+        ans = []
+        #target =torch.from_numpy(target)
+        for name in retrievals.keys():
+            tmp = retrievals[name]
+            #tmp = torch.from_numpy(tmp)
+            s = torch.cosine_similarity(target.squeeze(), tmp.squeeze(), dim=0)
+            ans.append(s.item())
+        return ans.index(max(ans))
+
+    def inference(self, text, audio_path):
+        target_emb = self.build_clip(text)  # torch type
+        idx = self.cal_similarity(target_emb, self.re_embeds)
+        target_event = self.id_to_event[idx]
+        embedding = self.ref_mel[target_event]
+        embedding = torch.from_numpy(embedding)
+        embedding = embedding.unsqueeze(0).to(self.device).float()
+        #print('embedding ', embedding.shape)
+        inputs,ti = self.extract_feature(audio_path)
+        #print('ti ', ti)
+        inputs = torch.from_numpy(inputs)
+        inputs = inputs.unsqueeze(0).to(self.device).float()
+        #print('inputs ', inputs.shape)
+        decision, decision_up, logit = self.model(inputs, embedding)
+        pred = decision_up.detach().cpu().numpy()
+        pred = pred[:,:,0]
+        frame_num = decision_up.shape[1]
+        time_ratio = ti / frame_num
+        filtered_pred = median_filter(pred, window_size=1, threshold=0.5)
+        #print('filtered_pred ', filtered_pred)
+        time_predictions = []
+        for index_k in range(filtered_pred.shape[0]):
+            decoded_pred = []
+            decoded_pred_ = decode_with_timestamps(target_event, filtered_pred[index_k,:])
+            if len(decoded_pred_) == 0:  # neg deal
+                decoded_pred_.append((target_event, 0, 0))
+            decoded_pred.append(decoded_pred_)
+            for num_batch in range(len(decoded_pred)):  # when we test our model, the batch_size is 1
+                cur_pred = pred[num_batch]
+                # Save each frame output, for later visualization
+                label_prediction = decoded_pred[num_batch]  # frame predict
+                # print(label_prediction)
+                for event_label, onset, offset in label_prediction:
+                    time_predictions.append({
+                        'onset': onset*time_ratio,
+                        'offset': offset*time_ratio,})
+        ans = ''
+        for i,item in enumerate(time_predictions):
+            ans = ans + 'segment' + str(i+1) + ' start_time: ' + str(item['onset']) + ' end_time: ' + str(item['offset']) + '\t'
+        #print(ans)
+        return ans
+
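# Reviewer sketch (not part of the patch): the expected shape of a TargetSoundDetection
# call and its answer string. The file name, query and numbers are invented. Note that
# inference() takes two arguments here, while the Tool registered below passes a single
# string, so the two would need to be reconciled.
tsd = TargetSoundDetection(device="cuda:0")
answer = tsd.inference("a dog barking", "audio/7c1d02ab.wav")
print(answer)
# e.g. "segment1 start_time: 3.21 end_time: 5.84\tsegment2 start_time: 11.02 end_time: 12.40\t"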
" + "The input to this tool should be a comma seperated string of two, representing mixture audio path and input text."), + Tool(name="Detect the sound event from the audio based on your descriptions", func=self.TSD.inference, + description="useful for when you want to know the when happens the target sound event in th audio. You can use language descriptions to instruct the model. receives text description and audio_path as input. " + "The input to this tool should be a string, representing the answer. ")] self.agent = initialize_agent( self.tools, self.llm, diff --git a/audio_detection/__init__.py b/audio_detection/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/audio_detection/audio_infer/__init__.py b/audio_detection/audio_infer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv new file mode 100644 index 0000000..48d8522 --- /dev/null +++ b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_evaluation_set.csv @@ -0,0 +1,1350 @@ +-JMT0mK0Dbg_30.000_40.000.wav 30.000 40.000 Train horn +3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train horn +3S2-TODd__k_90.000_100.000.wav 90.000 100.000 Train horn +3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train horn +3jXAh3V2FO8_30.000_40.000.wav 30.000 40.000 Train horn +53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train horn +8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train horn +8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train horn +9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train horn +AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train horn +Ag_zT74ZGNc_9.000_19.000.wav 9.000 19.000 Train horn +BQpa8whzwAE_30.000_40.000.wav 30.000 40.000 Train horn +CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train horn +CLIdVCUO_Vw_30.000_40.000.wav 30.000 40.000 Train horn +D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train horn +GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train horn +I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train horn +IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train horn +L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train horn +LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train horn +MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train horn +MPSf7dJpV5w_30.000_40.000.wav 30.000 40.000 Train horn +NdCr5IDnkxc_30.000_40.000.wav 30.000 40.000 Train horn +P54KKbTA_TE_0.000_7.000.wav 0.000 7.000 Train horn +PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train horn +QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train horn +R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train horn +Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train horn +TBjrN1aMRrM_30.000_40.000.wav 30.000 40.000 Train horn +XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train horn +XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train horn +Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train horn +Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train horn +YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Train horn +ZcTI8fQgEZE_240.000_250.000.wav 240.000 250.000 Train horn +_8MvhMlbwiE_40.000_50.000.wav 40.000 50.000 Train horn +_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train horn +aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train horn +arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train horn +d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train horn +dSzZWgbJ378_30.000_40.000.wav 30.000 40.000 Train horn +ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Train horn 
+g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Train horn +g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train horn +gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train horn +hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train horn +iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train horn +k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train horn +lKQ-I_P7TEM_20.000_30.000.wav 20.000 30.000 Train horn +nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train horn +pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train horn +pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train horn +q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train horn +qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train horn +stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train horn +tdRMxc4UWRk_30.000_40.000.wav 30.000 40.000 Train horn +tu-cxDG2mW8_0.000_10.000.wav 0.000 10.000 Train horn +txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train horn +xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train horn +yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train horn +-WoudI3gGvk_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +0_gci63CtFY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +3NX4HaOVBoo_240.000_250.000.wav 240.000 250.000 Air horn, truck horn +9NPKQDaNCRk_0.000_6.000.wav 0.000 6.000 Air horn, truck horn +9ct4w4aYWdc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +9l9QXgsJSfo_120.000_130.000.wav 120.000 130.000 Air horn, truck horn +CN0Bi4MDpA4_20.000_30.000.wav 20.000 30.000 Air horn, truck horn +CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +Cg-DWc9nPfQ_90.000_100.000.wav 90.000 100.000 Air horn, truck horn +D62L3husEa0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +Hk7HqLBHWng_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +IpyingiCwV8_0.000_3.000.wav 0.000 3.000 Air horn, truck horn +Isuh9pOuH6I_300.000_310.000.wav 300.000 310.000 Air horn, truck horn +IuTfMfzkr5Y_120.000_130.000.wav 120.000 130.000 Air horn, truck horn +MFxsgcZZtFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn +N3osL4QmOL8_49.000_59.000.wav 49.000 59.000 Air horn, truck horn +NOZsDTFLm7M_0.000_9.000.wav 0.000 9.000 Air horn, truck horn +OjVY3oM1jEU_40.000_50.000.wav 40.000 50.000 Air horn, truck horn +PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Air horn, truck horn +TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Air horn, truck horn +UdHR1P_NIbo_110.000_120.000.wav 110.000 120.000 Air horn, truck horn +YilfKdY7w6Y_60.000_70.000.wav 60.000 70.000 Air horn, truck horn +Yt4ZWNjvJOY_50.000_60.000.wav 50.000 60.000 Air horn, truck horn +Z5M3fGT3Xjk_60.000_70.000.wav 60.000 70.000 Air horn, truck horn +ZauRsP1uH74_12.000_22.000.wav 12.000 22.000 Air horn, truck horn +a_6CZ2JaEuc_0.000_2.000.wav 0.000 2.000 Air horn, truck horn +b7m5Kt5U7Vc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +bIObkrK06rk_15.000_25.000.wav 15.000 25.000 Air horn, truck horn +cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Air horn, truck horn +ckSYn557ZyE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn +cs-RPPsg_ks_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +ctsq33oUBT8_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +eCFUwyU9ZWA_9.000_19.000.wav 9.000 19.000 Air horn, truck horn +ePVb5Upev8k_40.000_50.000.wav 40.000 50.000 Air horn, truck horn +fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +fOVsAMJ3Yms_30.000_40.000.wav 30.000 
40.000 Air horn, truck horn +g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +gjlo4evwjlE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Air horn, truck horn +ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Air horn, truck horn +jko48cNdvFA_80.000_90.000.wav 80.000 90.000 Air horn, truck horn +kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +kUrb38hMwPs_0.000_10.000.wav 0.000 10.000 Air horn, truck horn +km_hVyma2vo_0.000_10.000.wav 0.000 10.000 Air horn, truck horn +m1e9aOwRiDQ_0.000_9.000.wav 0.000 9.000 Air horn, truck horn +mQJcObz1k_E_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +pk75WDyNZKc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Air horn, truck horn +suuYwAifIAQ_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +wDdEZ46B-tM_460.000_470.000.wav 460.000 470.000 Air horn, truck horn +wHISHmuP58s_80.000_90.000.wav 80.000 90.000 Air horn, truck horn +xwqIKDz1bT4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +y4Ko6VNiqB0_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +yhcmPrU3QSk_61.000_71.000.wav 61.000 71.000 Air horn, truck horn +3FWHjjZGT9U_80.000_90.000.wav 80.000 90.000 Car alarm +3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car alarm +3YRkin3bMlQ_170.000_180.000.wav 170.000 180.000 Car alarm +4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Car alarm +4JDah6Ckr9k_5.000_15.000.wav 5.000 15.000 Car alarm +5hL1uGb4sas_30.000_40.000.wav 30.000 40.000 Car alarm +969Zfj4IoPk_20.000_30.000.wav 20.000 30.000 Car alarm +AyfuBDN3Vdw_40.000_50.000.wav 40.000 50.000 Car alarm +B-ZqhRg3km4_60.000_70.000.wav 60.000 70.000 Car alarm +BDnwA3AaclE_10.000_20.000.wav 10.000 20.000 Car alarm +ES-rjFfuxq4_120.000_130.000.wav 120.000 130.000 Car alarm +EWbZq5ruCpg_0.000_10.000.wav 0.000 10.000 Car alarm +F50h9HiyC3k_40.000_50.000.wav 40.000 50.000 Car alarm +F5AP8kQvogM_30.000_40.000.wav 30.000 40.000 Car alarm +FKJuDOAumSk_20.000_30.000.wav 20.000 30.000 Car alarm +GmbNjZi4xBw_30.000_40.000.wav 30.000 40.000 Car alarm +H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Car alarm +Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car alarm +IziTYkSwq9Q_30.000_40.000.wav 30.000 40.000 Car alarm +JcO2TTtiplA_30.000_40.000.wav 30.000 40.000 Car alarm +KKx7dWRg8s8_8.000_18.000.wav 8.000 18.000 Car alarm +Kf9Kr69mwOA_14.000_24.000.wav 14.000 24.000 Car alarm +L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car alarm +LOjT44tFx1A_0.000_10.000.wav 0.000 10.000 Car alarm +Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car alarm +Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Car alarm +QNKo1W1WRbc_22.000_32.000.wav 22.000 32.000 Car alarm +R0VxYDfjyAU_60.000_70.000.wav 60.000 70.000 Car alarm +TJ58vMpSy1w_30.000_40.000.wav 30.000 40.000 Car alarm +ToU1kRagUjY_0.000_10.000.wav 0.000 10.000 Car alarm +TrQGIZqrW0s_30.000_40.000.wav 30.000 40.000 Car alarm +ULFhHR0OLSE_30.000_40.000.wav 30.000 40.000 Car alarm +ULS3ffQkCW4_30.000_40.000.wav 30.000 40.000 Car alarm +U_9NuNORYQM_1.000_11.000.wav 1.000 11.000 Car alarm +UkCEuwYUW8c_110.000_120.000.wav 110.000 120.000 Car alarm +Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Car alarm +XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car alarm +Y-4dtrP-RNo_7.000_17.000.wav 7.000 17.000 Car alarm +Zltlj0fDeS4_30.000_40.000.wav 30.000 40.000 Car alarm +cB1jkzgH2es_150.000_160.000.wav 150.000 160.000 Car alarm +eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Car alarm 
+eL7s5CoW0UA_0.000_7.000.wav 0.000 7.000 Car alarm +i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Car alarm +iWl-5LNURFc_30.000_40.000.wav 30.000 40.000 Car alarm +iX34nDCq9NU_10.000_20.000.wav 10.000 20.000 Car alarm +ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car alarm +l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car alarm +lhedRVb85Fk_30.000_40.000.wav 30.000 40.000 Car alarm +monelE7hnwI_20.000_30.000.wav 20.000 30.000 Car alarm +o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car alarm +pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car alarm +stnVta2ip9g_30.000_40.000.wav 30.000 40.000 Car alarm +uvuVg9Cl0n0_30.000_40.000.wav 30.000 40.000 Car alarm +vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 Car alarm +vN7dJyt-nj0_20.000_30.000.wav 20.000 30.000 Car alarm +w8Md65mE5Vc_30.000_40.000.wav 30.000 40.000 Car alarm +ySqfMcFk5LM_30.000_40.000.wav 30.000 40.000 Car alarm +ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Car alarm +za8KPcQ0dTw_30.000_40.000.wav 30.000 40.000 Car alarm +-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Reversing beeps +-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Reversing beeps +-oSzD8P2BtU_30.000_40.000.wav 30.000 40.000 Reversing beeps +-pzwalZ0ub0_5.000_15.000.wav 5.000 15.000 Reversing beeps +-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Reversing beeps +-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Reversing beeps +077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Reversing beeps +0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Reversing beeps +10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Reversing beeps +1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Reversing beeps +1n_s2Gb5R1Q_30.000_40.000.wav 30.000 40.000 Reversing beeps +2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Reversing beeps +2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Reversing beeps +2WTk_j_fivY_30.000_40.000.wav 30.000 40.000 Reversing beeps +38F6eeIR-s0_30.000_40.000.wav 30.000 40.000 Reversing beeps +3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Reversing beeps +4MIHbR4QZhE_30.000_40.000.wav 30.000 40.000 Reversing beeps +4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Reversing beeps +4XMY2IvVSf0_30.000_40.000.wav 30.000 40.000 Reversing beeps +4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Reversing beeps +4t1VqRz4w2g_30.000_40.000.wav 30.000 40.000 Reversing beeps +4tKvAMmAUMM_30.000_40.000.wav 30.000 40.000 Reversing beeps +5-x2pk3YYAs_11.000_21.000.wav 11.000 21.000 Reversing beeps +5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Reversing beeps +5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Reversing beeps +5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Reversing beeps +5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Reversing beeps +6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Reversing beeps +6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Reversing beeps +6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Reversing beeps +7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Reversing beeps +7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Reversing beeps +81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Reversing beeps +96a4smrM_30_30.000_40.000.wav 30.000 40.000 Reversing beeps +9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Reversing beeps +9OcAwC8y-eQ_30.000_40.000.wav 30.000 40.000 Reversing beeps +9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Reversing beeps +9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Reversing beeps +A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Reversing beeps +AFwmMFq_xlc_390.000_400.000.wav 390.000 400.000 Reversing beeps +AvhBRiwWJU4_30.000_40.000.wav 30.000 40.000 Reversing beeps +CL5vkiMs2c0_10.000_20.000.wav 10.000 20.000 Reversing beeps 
+DcU6AzN7imA_210.000_220.000.wav 210.000 220.000 Reversing beeps +ISBJKY8hwnM_30.000_40.000.wav 30.000 40.000 Reversing beeps +LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Reversing beeps +NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Reversing beeps +PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Reversing beeps +Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Reversing beeps +_gG0KNGD47M_30.000_40.000.wav 30.000 40.000 Reversing beeps +ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Reversing beeps +eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Reversing beeps +kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Reversing beeps +mCJ0aqIygWE_24.000_34.000.wav 24.000 34.000 Reversing beeps +nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Reversing beeps +nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Reversing beeps +oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Reversing beeps +saPU2JNoytU_0.000_10.000.wav 0.000 10.000 Reversing beeps +tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Reversing beeps +vzP6soELj2Q_0.000_10.000.wav 0.000 10.000 Reversing beeps +0x82_HySIVU_30.000_40.000.wav 30.000 40.000 Bicycle +1IQdvfm9SDY_30.000_40.000.wav 30.000 40.000 Bicycle +1_hGvbEiYAs_30.000_40.000.wav 30.000 40.000 Bicycle +26CM8IXODG4_2.000_12.000.wav 2.000 12.000 Bicycle +2f7Ad-XpbnY_30.000_40.000.wav 30.000 40.000 Bicycle +3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Bicycle +7KiTXYwaD04_7.000_17.000.wav 7.000 17.000 Bicycle +7gkjn-LLInI_30.000_40.000.wav 30.000 40.000 Bicycle +84flVacRHUI_21.000_31.000.wav 21.000 31.000 Bicycle +9VziOIkNXsE_30.000_40.000.wav 30.000 40.000 Bicycle +ANofTuuN0W0_160.000_170.000.wav 160.000 170.000 Bicycle +B6n0op0sLPA_30.000_40.000.wav 30.000 40.000 Bicycle +D4_zTwsCRds_60.000_70.000.wav 60.000 70.000 Bicycle +DEs_Sp9S1Nw_30.000_40.000.wav 30.000 40.000 Bicycle +GjsxrMRRdfQ_3.000_13.000.wav 3.000 13.000 Bicycle +GkpUU3VX4wQ_30.000_40.000.wav 30.000 40.000 Bicycle +H9HNXYxRmv8_30.000_40.000.wav 30.000 40.000 Bicycle +HPWRKwrs-rY_370.000_380.000.wav 370.000 380.000 Bicycle +HrQxbNO5jXU_6.000_16.000.wav 6.000 16.000 Bicycle +IYaEZkAO0LU_30.000_40.000.wav 30.000 40.000 Bicycle +Idzfy0XbZRo_7.000_17.000.wav 7.000 17.000 Bicycle +Iigfz_GeXVs_30.000_40.000.wav 30.000 40.000 Bicycle +JWCtQ_94YoQ_30.000_40.000.wav 30.000 40.000 Bicycle +JXmBrD4b4EI_30.000_40.000.wav 30.000 40.000 Bicycle +LSZPNwZex9s_30.000_40.000.wav 30.000 40.000 Bicycle +M5kwg1kx4q0_30.000_40.000.wav 30.000 40.000 Bicycle +NrR1wmCpqAk_12.000_22.000.wav 12.000 22.000 Bicycle +O1_Rw2dHb1I_2.000_12.000.wav 2.000 12.000 Bicycle +OEN0TySl1Jw_10.000_20.000.wav 10.000 20.000 Bicycle +PF7uY9ydMYc_30.000_40.000.wav 30.000 40.000 Bicycle +SDl0tWf9Q44_30.000_40.000.wav 30.000 40.000 Bicycle +SkXXjcw9sJI_30.000_40.000.wav 30.000 40.000 Bicycle +Ssa1m5Mnllw_0.000_9.000.wav 0.000 9.000 Bicycle +UB-A1oyNyyg_0.000_6.000.wav 0.000 6.000 Bicycle +UqyvFyQthHo_30.000_40.000.wav 30.000 40.000 Bicycle +Wg4ik5zZxBc_250.000_260.000.wav 250.000 260.000 Bicycle +WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bicycle +YIJBuXUi64U_30.000_40.000.wav 30.000 40.000 Bicycle +aBHdl_TiseI_30.000_40.000.wav 30.000 40.000 Bicycle +aeHCq6fFkNo_30.000_40.000.wav 30.000 40.000 Bicycle +amKDjVcs1Vg_30.000_40.000.wav 30.000 40.000 Bicycle +ehYwty_G2L4_13.000_23.000.wav 13.000 23.000 Bicycle +jOlVJv7jAHg_30.000_40.000.wav 30.000 40.000 Bicycle +lGFDQ-ZwUfk_30.000_40.000.wav 30.000 40.000 Bicycle +lmTHvLGQy3g_50.000_60.000.wav 50.000 60.000 Bicycle +nNHW3Uxlb-g_30.000_40.000.wav 30.000 40.000 Bicycle +o98R4ruf8kw_30.000_40.000.wav 30.000 40.000 Bicycle +oiLHBkHgkAo_0.000_8.000.wav 
0.000 8.000 Bicycle +qL0ESQcaPhM_30.000_40.000.wav 30.000 40.000 Bicycle +qjz5t9M4YCw_30.000_40.000.wav 30.000 40.000 Bicycle +qrCWPsqG9vA_30.000_40.000.wav 30.000 40.000 Bicycle +r06tmeUDgc8_3.000_13.000.wav 3.000 13.000 Bicycle +sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Bicycle +tKdRlWz-1pg_30.000_40.000.wav 30.000 40.000 Bicycle +uNpSMpqlkMA_0.000_10.000.wav 0.000 10.000 Bicycle +vOYj9W7Jsxk_8.000_18.000.wav 8.000 18.000 Bicycle +xBKrmKdjAIA_0.000_10.000.wav 0.000 10.000 Bicycle +xfNeZaw4o3U_17.000_27.000.wav 17.000 27.000 Bicycle +xgiJqbhhU3c_30.000_40.000.wav 30.000 40.000 Bicycle +0vg9qxNKXOw_30.000_40.000.wav 30.000 40.000 Skateboard +10YXuv9Go0E_140.000_150.000.wav 140.000 150.000 Skateboard +3-a8i_MEUl8_30.000_40.000.wav 30.000 40.000 Skateboard +6kXUG1Zo6VA_0.000_10.000.wav 0.000 10.000 Skateboard +84fDGWoRtsU_210.000_220.000.wav 210.000 220.000 Skateboard +8kbHA22EWd0_330.000_340.000.wav 330.000 340.000 Skateboard +8m-a_6wLTkU_230.000_240.000.wav 230.000 240.000 Skateboard +9QwaP-cvdeU_360.000_370.000.wav 360.000 370.000 Skateboard +9ZYj5toEbGA_0.000_10.000.wav 0.000 10.000 Skateboard +9gkppwB5CXA_30.000_40.000.wav 30.000 40.000 Skateboard +9hlXgXWXYXQ_0.000_6.000.wav 0.000 6.000 Skateboard +ALxn5-2bVyI_30.000_40.000.wav 30.000 40.000 Skateboard +ANPjV_rudog_30.000_40.000.wav 30.000 40.000 Skateboard +ATAL-_Dblvg_0.000_7.000.wav 0.000 7.000 Skateboard +An-4jPvUT14_60.000_70.000.wav 60.000 70.000 Skateboard +BGR0QnX4k6w_30.000_40.000.wav 30.000 40.000 Skateboard +BlhUt8AJJO8_30.000_40.000.wav 30.000 40.000 Skateboard +CD7INyI79fM_170.000_180.000.wav 170.000 180.000 Skateboard +CNcxzB9F-Q8_100.000_110.000.wav 100.000 110.000 Skateboard +DqOGYyFVnKk_200.000_210.000.wav 200.000 210.000 Skateboard +E0gBwPTHxqE_30.000_40.000.wav 30.000 40.000 Skateboard +E3XIdP8kxwg_110.000_120.000.wav 110.000 120.000 Skateboard +FQZnQhiM41U_0.000_6.000.wav 0.000 6.000 Skateboard +FRwFfq3Tl1g_310.000_320.000.wav 310.000 320.000 Skateboard +JJo971B_eDg_30.000_40.000.wav 30.000 40.000 Skateboard +KXkxqxoCylc_30.000_40.000.wav 30.000 40.000 Skateboard +L4Z7XkS6CtA_30.000_40.000.wav 30.000 40.000 Skateboard +LjEqr0Z7xm0_0.000_6.000.wav 0.000 6.000 Skateboard +MAbDEeLF4cQ_30.000_40.000.wav 30.000 40.000 Skateboard +MUBbiivNYZs_30.000_40.000.wav 30.000 40.000 Skateboard +Nq8GyBrTI8Y_30.000_40.000.wav 30.000 40.000 Skateboard +PPq9QZmV7jc_25.000_35.000.wav 25.000 35.000 Skateboard +PVgL5wFOKMs_30.000_40.000.wav 30.000 40.000 Skateboard +Tcq_xAdCMr4_30.000_40.000.wav 30.000 40.000 Skateboard +UtZofZjccBs_290.000_300.000.wav 290.000 300.000 Skateboard +VZfrDZhI7BU_30.000_40.000.wav 30.000 40.000 Skateboard +WxChkRrVOIs_0.000_7.000.wav 0.000 7.000 Skateboard +YV0noe1sZAs_150.000_160.000.wav 150.000 160.000 Skateboard +YjScrri_F7U_0.000_10.000.wav 0.000 10.000 Skateboard +YrGQKTbiG1g_30.000_40.000.wav 30.000 40.000 Skateboard +ZM67kt6G-d4_30.000_40.000.wav 30.000 40.000 Skateboard +ZaUaqnLdg6k_30.000_40.000.wav 30.000 40.000 Skateboard +ZhpkRcAEJzc_3.000_13.000.wav 3.000 13.000 Skateboard +_43OOP6UEw0_30.000_40.000.wav 30.000 40.000 Skateboard +_6Fyave4jqA_260.000_270.000.wav 260.000 270.000 Skateboard +aOoZ0bCoaZw_30.000_40.000.wav 30.000 40.000 Skateboard +gV6y9L24wWg_0.000_10.000.wav 0.000 10.000 Skateboard +hHb0Eq1I7Fk_0.000_10.000.wav 0.000 10.000 Skateboard +lGf_L6i6AZI_20.000_30.000.wav 20.000 30.000 Skateboard +leOH87itNWM_30.000_40.000.wav 30.000 40.000 Skateboard +mIkW7mWlnXw_30.000_40.000.wav 30.000 40.000 Skateboard +qadmKrM0ppo_20.000_30.000.wav 20.000 30.000 Skateboard 
+rLUIHCc4b9A_0.000_7.000.wav 0.000 7.000 Skateboard +u3vBJgEVJvk_0.000_10.000.wav 0.000 10.000 Skateboard +vHKBrtPDSvA_150.000_160.000.wav 150.000 160.000 Skateboard +wWmydRt0Z-w_21.000_31.000.wav 21.000 31.000 Skateboard +xeHt-R5ScmI_0.000_10.000.wav 0.000 10.000 Skateboard +xqGtIVeeXY4_330.000_340.000.wav 330.000 340.000 Skateboard +y_lfY0uzmr0_30.000_40.000.wav 30.000 40.000 Skateboard +02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Ambulance (siren) +5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +7eeN-fXbso8_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +8qMHvgA9mGw_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +BGp9-Ro5h8Y_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Ambulance (siren) +Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +F9Dbcxr-lAI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +R8G5Y0HASxY_60.000_70.000.wav 60.000 70.000 Ambulance (siren) +RVTKY5KR3ME_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +Sm0pPvXPA9U_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Ambulance (siren) +ZxlbI2Rj1VY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +bA8mt0JI0Ko_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +cR79KnWpiQA_70.000_80.000.wav 70.000 80.000 Ambulance (siren) +dPcw4R5lczw_500.000_510.000.wav 500.000 510.000 Ambulance (siren) +epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Ambulance (siren) +fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +iSnWMz4FUAg_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +ke35yF1LHs4_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +oPR7tUEUptk_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Ambulance (siren) 
+rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Ambulance (siren) +s0iddDFzL9s_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +tcKlq7_cOkw_8.000_18.000.wav 8.000 18.000 Ambulance (siren) +u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Ambulance (siren) +z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +zbiJEml563w_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Fire engine, fire truck (siren) +-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +0K1mroXg8bs_9.000_19.000.wav 9.000 19.000 Fire engine, fire truck (siren) +0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Fire engine, fire truck (siren) +3h3_IZWhX0g_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +4APBvMmKubU_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren) +5fjy_2ajEkg_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Fire engine, fire truck (siren) +ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Fire engine, fire truck (siren) +AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +Bs2KqqI9F_k_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +Cc7-P0py1Mc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +D4M3YT75ZrQ_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren) +DWXQ_cSUW98_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +Daqv2F6SEmQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +DpagxUQwXDo_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +FFSI6Bg2M-Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +GbIuxmaiCOk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +IiCh2H3JtsE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren) +JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +MEUcv-QM0cQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Fire engine, fire truck (siren) +VXI3-DI4xNs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +Xggsbzzes3M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren) +ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren) +bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) 
+cHm1cYBAXMI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +k2a30--j37Q_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +pvYwIdGrS90_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +qsHc2X1toLs_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren) +u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +vBXPyBiyJG0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +vVqUvv1SSu8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +wD0P-doqkXo_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren) +xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +ysNK5RVF3Zw_0.000_10.000.wav 0.000 10.000 Fire engine, fire truck (siren) +z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +zpzJKMG5iGc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +02Ak1eIyj3M_30.000_40.000.wav 30.000 40.000 Civil defense siren +0CJFt950vOk_30.000_40.000.wav 30.000 40.000 Civil defense siren +0phl6nlC-n0_10.000_20.000.wav 10.000 20.000 Civil defense siren +1jhbNtCWC9w_50.000_60.000.wav 50.000 60.000 Civil defense siren +4Ukj2TTJxHM_30.000_40.000.wav 30.000 40.000 Civil defense siren +4XAVaSz_P7c_150.000_160.000.wav 150.000 160.000 Civil defense siren +69AIBPnJN5E_0.000_10.000.wav 0.000 10.000 Civil defense siren +8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Civil defense siren +8ILgvaJVPCI_30.000_40.000.wav 30.000 40.000 Civil defense siren +9MWHXCLAX8I_30.000_40.000.wav 30.000 40.000 Civil defense siren +A5y-aZc0CiM_30.000_40.000.wav 30.000 40.000 Civil defense siren +AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Civil defense siren +AVBUh6qeHrQ_30.000_40.000.wav 30.000 40.000 Civil defense siren +BhQPDafekdw_30.000_40.000.wav 30.000 40.000 Civil defense siren +CJXNdudcJrs_30.000_40.000.wav 30.000 40.000 Civil defense siren +CU2MyVM_B48_30.000_40.000.wav 30.000 40.000 Civil defense siren +DdZw0XDv0JI_30.000_40.000.wav 30.000 40.000 Civil defense siren +DgWHUawAGnI_30.000_40.000.wav 30.000 40.000 Civil defense siren +Do9Dffb6vHA_30.000_40.000.wav 30.000 40.000 Civil defense siren +GO2zKyMtBV4_30.000_40.000.wav 30.000 40.000 Civil defense siren +GeRgy4of730_30.000_40.000.wav 30.000 40.000 Civil defense siren +IIypdzgZAaI_30.000_40.000.wav 30.000 40.000 Civil defense siren +JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Civil defense siren +JqHJ7015aWM_30.000_40.000.wav 30.000 40.000 Civil defense siren +K7a1P4RX_5w_30.000_40.000.wav 30.000 40.000 Civil defense siren +KrTocA-I550_190.000_200.000.wav 190.000 200.000 Civil defense siren +KumYcZVLOVU_350.000_360.000.wav 350.000 360.000 Civil defense siren +L60HS_jbZu0_30.000_40.000.wav 30.000 40.000 Civil defense siren +MZ1Yh6mRC-E_30.000_40.000.wav 30.000 40.000 Civil defense siren +R8XUrRCFkzs_30.000_40.000.wav 30.000 40.000 
Civil defense siren +SyWbolNFst4_60.000_70.000.wav 60.000 70.000 Civil defense siren +TYLZuBBu8ms_0.000_10.000.wav 0.000 10.000 Civil defense siren +Tx6eSkU2lKc_30.000_40.000.wav 30.000 40.000 Civil defense siren +VcflBZLflSU_130.000_140.000.wav 130.000 140.000 Civil defense siren +WXsTHg_DiYA_30.000_40.000.wav 30.000 40.000 Civil defense siren +Wz5ffJxCElQ_10.000_20.000.wav 10.000 20.000 Civil defense siren +X2MlmcY8UZU_30.000_40.000.wav 30.000 40.000 Civil defense siren +XYLheTmlEYI_30.000_40.000.wav 30.000 40.000 Civil defense siren +YyxlD_FwZXM_30.000_40.000.wav 30.000 40.000 Civil defense siren +adCuLs-4nmI_30.000_40.000.wav 30.000 40.000 Civil defense siren +cPjtrTq3F-I_30.000_40.000.wav 30.000 40.000 Civil defense siren +eHDm93tI4Ok_30.000_40.000.wav 30.000 40.000 Civil defense siren +etppP5Sdo14_30.000_40.000.wav 30.000 40.000 Civil defense siren +fRKxUc1gQBw_50.000_60.000.wav 50.000 60.000 Civil defense siren +feIue4LHzfM_30.000_40.000.wav 30.000 40.000 Civil defense siren +gr-Yen6Sj_Q_0.000_10.000.wav 0.000 10.000 Civil defense siren +hl3Kqi9Wi_g_30.000_40.000.wav 30.000 40.000 Civil defense siren +iKca2cbowd4_30.000_40.000.wav 30.000 40.000 Civil defense siren +kzFyGWdj6MI_30.000_40.000.wav 30.000 40.000 Civil defense siren +m3LGopSVju4_30.000_40.000.wav 30.000 40.000 Civil defense siren +ne4IMxs-hMk_30.000_40.000.wav 30.000 40.000 Civil defense siren +nuu2iNisoQc_6.000_16.000.wav 6.000 16.000 Civil defense siren +oYeql9xE19k_30.000_40.000.wav 30.000 40.000 Civil defense siren +rGUrM19BnJ8_110.000_120.000.wav 110.000 120.000 Civil defense siren +u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Civil defense siren +uCRAnDBXxgI_30.000_40.000.wav 30.000 40.000 Civil defense siren +vQG4HZR2KSk_30.000_40.000.wav 30.000 40.000 Civil defense siren +vjsG5b2yNzc_190.000_200.000.wav 190.000 200.000 Civil defense siren +yO7guxGY-_k_30.000_40.000.wav 30.000 40.000 Civil defense siren +-9GUUhB3QV0_30.000_40.000.wav 30.000 40.000 Police car (siren) +-HxRz4w60-Y_150.000_160.000.wav 150.000 160.000 Police car (siren) +-UBVqmhbT50_30.000_40.000.wav 30.000 40.000 Police car (siren) +-_dElQcyJnA_30.000_40.000.wav 30.000 40.000 Police car (siren) +0N0C0Wbe6AI_30.000_40.000.wav 30.000 40.000 Police car (siren) +0SvSNVatkv0_30.000_40.000.wav 30.000 40.000 Police car (siren) +145N68nh4m0_120.000_130.000.wav 120.000 130.000 Police car (siren) +2-h8MRSRvEg_30.000_40.000.wav 30.000 40.000 Police car (siren) +31WGUPOYS5g_22.000_32.000.wav 22.000 32.000 Police car (siren) +5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Police car (siren) +6rXgD5JlYxY_30.000_40.000.wav 30.000 40.000 Police car (siren) +8Aq2DyLbUBA_30.000_40.000.wav 30.000 40.000 Police car (siren) +8DaEd5KbnnA_80.000_90.000.wav 80.000 90.000 Police car (siren) +8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Police car (siren) +9CRb-PToaAM_30.000_40.000.wav 30.000 40.000 Police car (siren) +9OFUd38sBNM_0.000_8.000.wav 0.000 8.000 Police car (siren) +AQCZH4OdNSM_30.000_40.000.wav 30.000 40.000 Police car (siren) +AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Police car (siren) +CDrpqsGqfPo_10.000_20.000.wav 10.000 20.000 Police car (siren) +DK_6C29B2zs_14.000_24.000.wav 14.000 24.000 Police car (siren) +GORjnSWhZeY_30.000_40.000.wav 30.000 40.000 Police car (siren) +GgV0yYogTPI_30.000_40.000.wav 30.000 40.000 Police car (siren) +H6c8ZDrdUaM_30.000_40.000.wav 30.000 40.000 Police car (siren) +H7lOMlND9dc_30.000_40.000.wav 30.000 40.000 Police car (siren) +H9xQQVv3ElI_30.000_40.000.wav 30.000 40.000 Police car (siren) +IiCh2H3JtsE_30.000_40.000.wav 30.000 
40.000 Police car (siren) +InrS4Fdndr4_0.000_10.000.wav 0.000 10.000 Police car (siren) +JgDuU9kpHpM_30.000_40.000.wav 30.000 40.000 Police car (siren) +JpLA7HY9r3Y_30.000_40.000.wav 30.000 40.000 Police car (siren) +LNQ7fzfdLiY_30.000_40.000.wav 30.000 40.000 Police car (siren) +PCl-q7lCT_U_50.000_60.000.wav 50.000 60.000 Police car (siren) +QWVub6-0jX4_30.000_40.000.wav 30.000 40.000 Police car (siren) +Wak5QxsS-QU_30.000_40.000.wav 30.000 40.000 Police car (siren) +YbiiaDBU-HI_10.000_20.000.wav 10.000 20.000 Police car (siren) +Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Police car (siren) +ZeH6Fc7Y900_30.000_40.000.wav 30.000 40.000 Police car (siren) +ZlS4vIWQMmE_0.000_10.000.wav 0.000 10.000 Police car (siren) +ZyuX_gMFiss_30.000_40.000.wav 30.000 40.000 Police car (siren) +bIU0X1v4SF0_30.000_40.000.wav 30.000 40.000 Police car (siren) +eIMjkADTWzA_60.000_70.000.wav 60.000 70.000 Police car (siren) +epwDz5WBkvc_80.000_90.000.wav 80.000 90.000 Police car (siren) +fHaQPHCjyfA_30.000_40.000.wav 30.000 40.000 Police car (siren) +fNcrlqPrAqM_30.000_40.000.wav 30.000 40.000 Police car (siren) +g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Police car (siren) +gw9pYEG2Zb0_20.000_30.000.wav 20.000 30.000 Police car (siren) +iEX8L_oEbsU_30.000_40.000.wav 30.000 40.000 Police car (siren) +iM-U56fTTOQ_30.000_40.000.wav 30.000 40.000 Police car (siren) +kJuvA2zmrnY_30.000_40.000.wav 30.000 40.000 Police car (siren) +kSjvt2Z_pBo_30.000_40.000.wav 30.000 40.000 Police car (siren) +lqGtL8sUo_g_30.000_40.000.wav 30.000 40.000 Police car (siren) +mAfPu0meA_Y_20.000_30.000.wav 20.000 30.000 Police car (siren) +mlS9LLiMIG8_30.000_40.000.wav 30.000 40.000 Police car (siren) +pzup58Eyhuo_30.000_40.000.wav 30.000 40.000 Police car (siren) +rCQykaL8Hy4_30.000_40.000.wav 30.000 40.000 Police car (siren) +rhUfN81puDI_0.000_10.000.wav 0.000 10.000 Police car (siren) +u08iA12iAmM_30.000_40.000.wav 30.000 40.000 Police car (siren) +u3yYpMwG4Us_30.000_40.000.wav 30.000 40.000 Police car (siren) +u9aHjYGbl5o_30.000_40.000.wav 30.000 40.000 Police car (siren) +uUiZrgUpw2A_30.000_40.000.wav 30.000 40.000 Police car (siren) +vYKWnuvq2FI_30.000_40.000.wav 30.000 40.000 Police car (siren) +xbr7x2V6mxk_30.000_40.000.wav 30.000 40.000 Police car (siren) +z4B14tAqJ4w_30.000_40.000.wav 30.000 40.000 Police car (siren) +-FKrYTj_eCU_0.000_10.000.wav 0.000 10.000 Screaming +0G50t4FlbIA_60.000_70.000.wav 60.000 70.000 Screaming +1LTxZ2aNytc_30.000_40.000.wav 30.000 40.000 Screaming +2FEhG1UXb_E_370.000_380.000.wav 370.000 380.000 Screaming +45vBbOhzS6g_50.000_60.000.wav 50.000 60.000 Screaming +4PYTtp78Ig0_60.000_70.000.wav 60.000 70.000 Screaming +5QNq0IEPICQ_30.000_40.000.wav 30.000 40.000 Screaming +5YcIJuYQECc_0.000_6.000.wav 0.000 6.000 Screaming +5kQF4r03yRI_0.000_6.000.wav 0.000 6.000 Screaming +7ARVgI_wx5Y_30.000_40.000.wav 30.000 40.000 Screaming +AIFvFuZPr68_30.000_40.000.wav 30.000 40.000 Screaming +Aw43FUCkIb8_20.000_30.000.wav 20.000 30.000 Screaming +AxM2BofYfPY_30.000_40.000.wav 30.000 40.000 Screaming +BFqHyCoypfM_16.000_26.000.wav 16.000 26.000 Screaming +Bk_xS_fKCpk_30.000_40.000.wav 30.000 40.000 Screaming +C4YMjmJ7tt4_90.000_100.000.wav 90.000 100.000 Screaming +CMWoAvgD0A0_9.000_19.000.wav 9.000 19.000 Screaming +DZfYFhywhRs_30.000_40.000.wav 30.000 40.000 Screaming +ElJFYwRtrH4_30.000_40.000.wav 30.000 40.000 Screaming +FcUVtXJMkJs_30.000_40.000.wav 30.000 40.000 Screaming +G--718JDmAQ_0.000_10.000.wav 0.000 10.000 Screaming +GPJ1uQwmNHk_30.000_40.000.wav 30.000 40.000 Screaming 
+H3vSRzkG82U_30.000_40.000.wav 30.000 40.000 Screaming +HS28EUWt8dE_110.000_120.000.wav 110.000 120.000 Screaming +KkGTB8ESMCM_0.000_10.000.wav 0.000 10.000 Screaming +MQ0YasvMcuQ_1.000_11.000.wav 1.000 11.000 Screaming +Msl9dI5yweA_90.000_100.000.wav 90.000 100.000 Screaming +Ntn6YvZM3kA_0.000_10.000.wav 0.000 10.000 Screaming +NwTHlpXdk4M_30.000_40.000.wav 30.000 40.000 Screaming +OHjfSfqa804_0.000_10.000.wav 0.000 10.000 Screaming +OzWJuqG2F3Y_30.000_40.000.wav 30.000 40.000 Screaming +QDW_uCMnMMU_0.000_8.000.wav 0.000 8.000 Screaming +SxI3Lnzzmkw_110.000_120.000.wav 110.000 120.000 Screaming +TVvbfuGu9eM_70.000_80.000.wav 70.000 80.000 Screaming +YCk9F0Uq3BE_70.000_80.000.wav 70.000 80.000 Screaming +Z54pSnNw2iM_30.000_40.000.wav 30.000 40.000 Screaming +a59ivTlYoNk_310.000_320.000.wav 310.000 320.000 Screaming +auC_LgwFF8g_30.000_40.000.wav 30.000 40.000 Screaming +bi8R9JbF2cc_80.000_90.000.wav 80.000 90.000 Screaming +cdbYsoEasio_70.000_80.000.wav 70.000 80.000 Screaming +dfsvT5xImNg_80.000_90.000.wav 80.000 90.000 Screaming +e2AaF6siR1A_540.000_550.000.wav 540.000 550.000 Screaming +gB1ytjgpcW4_190.000_200.000.wav 190.000 200.000 Screaming +gE-0JxMtUh0_20.000_30.000.wav 20.000 30.000 Screaming +hWiGgsuGnzs_100.000_110.000.wav 100.000 110.000 Screaming +l-iIfi3SNpw_120.000_130.000.wav 120.000 130.000 Screaming +mT-f0lGk-JM_30.000_40.000.wav 30.000 40.000 Screaming +nApE_Biu13k_10.000_20.000.wav 10.000 20.000 Screaming +nRMmafPUAEU_80.000_90.000.wav 80.000 90.000 Screaming +nYAbLuyqPis_30.000_40.000.wav 30.000 40.000 Screaming +nlYlNF30bVg_30.000_40.000.wav 30.000 40.000 Screaming +sUp-UXzgmrA_0.000_10.000.wav 0.000 10.000 Screaming +syIwNMo2TUA_0.000_7.000.wav 0.000 7.000 Screaming +uTu0a1wd9-M_21.000_31.000.wav 21.000 31.000 Screaming +xVG7dfH5DL0_320.000_330.000.wav 320.000 330.000 Screaming +xvAQ44hx3_k_220.000_230.000.wav 220.000 230.000 Screaming +yNTkb2zgA_M_70.000_80.000.wav 70.000 80.000 Screaming +zCdOEvduBTo_30.000_40.000.wav 30.000 40.000 Screaming +zMICvbCJ6zc_550.000_560.000.wav 550.000 560.000 Screaming +-0RWZT-miFs_420.000_430.000.wav 420.000 430.000 Car +-1pRmoJIGQc_11.000_21.000.wav 11.000 21.000 Car +-7eDqv-6AKQ_30.000_40.000.wav 30.000 40.000 Car +-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car +-HWygXWSNRA_30.000_40.000.wav 30.000 40.000 Car +-PVEno65928_30.000_40.000.wav 30.000 40.000 Car +-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car +0O-gZoirpRA_30.000_40.000.wav 30.000 40.000 Car +0QwxnzHf_0E_30.000_40.000.wav 30.000 40.000 Car +0bg1nzEVdgY_0.000_10.000.wav 0.000 10.000 Car +0lpPdWvg7Eo_0.000_10.000.wav 0.000 10.000 Car +11Pn3yJifSQ_4.000_14.000.wav 4.000 14.000 Car +1BgqrhbyRFw_30.000_40.000.wav 30.000 40.000 Car +1F9zCsJyw6k_430.000_440.000.wav 430.000 440.000 Car +1HayoASR-54_80.000_90.000.wav 80.000 90.000 Car +1P5FFxXLSpY_30.000_40.000.wav 30.000 40.000 Car +1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Car +27m49pmJ8Og_370.000_380.000.wav 370.000 380.000 Car +2E_N8lnoVKE_30.000_40.000.wav 30.000 40.000 Car +2Fdau5KTEls_30.000_40.000.wav 30.000 40.000 Car +2STASUlGAjs_30.000_40.000.wav 30.000 40.000 Car +2fi0m8ei_B4_30.000_40.000.wav 30.000 40.000 Car +2uMXfAIMeN0_180.000_190.000.wav 180.000 190.000 Car +32V2zsK7GME_110.000_120.000.wav 110.000 120.000 Car +3YChVhqW42E_130.000_140.000.wav 130.000 140.000 Car +3_OLj6XChvM_30.000_40.000.wav 30.000 40.000 Car +3hLxPQpmfQo_30.000_40.000.wav 30.000 40.000 Car +3mDPQ_CPopw_30.000_40.000.wav 30.000 40.000 Car +3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car +3xh2kScw64U_30.000_40.000.wav 30.000 40.000 Car 
+40s88hEcn5I_170.000_180.000.wav 170.000 180.000 Car +42P93B_GzGA_30.000_40.000.wav 30.000 40.000 Car +4KZWpXlcpM4_60.000_70.000.wav 60.000 70.000 Car +4TshFWSsrn8_290.000_300.000.wav 290.000 300.000 Car +4WRgvRI06zc_30.000_40.000.wav 30.000 40.000 Car +4aJfQpHt9lY_160.000_170.000.wav 160.000 170.000 Car +4hd2CLrzCZs_30.000_40.000.wav 30.000 40.000 Car +4zCHl7pRsNY_30.000_40.000.wav 30.000 40.000 Car +5RgHBmX2HLw_30.000_40.000.wav 30.000 40.000 Car +5oirFKi6Sfo_190.000_200.000.wav 190.000 200.000 Car +5vmxFp1r1ZM_30.000_40.000.wav 30.000 40.000 Car +5z1rE_l-0Ow_0.000_8.000.wav 0.000 8.000 Car +620GoTv5Ic8_30.000_40.000.wav 30.000 40.000 Car +6BitLl5Bnxw_30.000_40.000.wav 30.000 40.000 Car +6FVA4hqp1Ro_30.000_40.000.wav 30.000 40.000 Car +6U942AYlcXA_30.000_40.000.wav 30.000 40.000 Car +6b2ZMMrLTz8_5.000_15.000.wav 5.000 15.000 Car +6ibh38autyA_30.000_40.000.wav 30.000 40.000 Car +6kuESYFcEqw_30.000_40.000.wav 30.000 40.000 Car +73cuZZq-J3w_20.000_30.000.wav 20.000 30.000 Car +764IcMEMVUk_90.000_100.000.wav 90.000 100.000 Car +7NH1WJlSiYI_30.000_40.000.wav 30.000 40.000 Car +7lJu9wEsErY_220.000_230.000.wav 220.000 230.000 Car +8CqqK9CzuXM_30.000_40.000.wav 30.000 40.000 Car +8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car +8Wk-ZmlsUqY_28.000_38.000.wav 28.000 38.000 Car +8q8JrJNAa-Q_30.000_40.000.wav 30.000 40.000 Car +8rMlNbKlp_s_0.000_10.000.wav 0.000 10.000 Car +8sGJFPr2Nmc_30.000_40.000.wav 30.000 40.000 Car +8yRROnG0-lA_30.000_40.000.wav 30.000 40.000 Car +9Ti98L4PRCo_17.000_27.000.wav 17.000 27.000 Car +9fzAWj5YJ9c_30.000_40.000.wav 30.000 40.000 Car +9rq8h4oMJ98_30.000_40.000.wav 30.000 40.000 Car +9ye2Fn62xDc_60.000_70.000.wav 60.000 70.000 Car +ACGuC6SH4V4_150.000_160.000.wav 150.000 160.000 Car +AFz5TIs_Gug_30.000_40.000.wav 30.000 40.000 Car +AedlWfHafgw_21.000_31.000.wav 21.000 31.000 Car +AlsDSDTiaWI_30.000_40.000.wav 30.000 40.000 Car +B3SkK0wuOhY_130.000_140.000.wav 130.000 140.000 Car +B9n4a5ciI48_16.000_26.000.wav 16.000 26.000 Car +BAekfGvUtFM_30.000_40.000.wav 30.000 40.000 Car +BNLOvQbrPdc_290.000_300.000.wav 290.000 300.000 Car +BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Car +Bqx_SZgCzZw_10.000_20.000.wav 10.000 20.000 Car +CZB6WXDuM1g_30.000_40.000.wav 30.000 40.000 Car +C_pnsyNXphA_30.000_40.000.wav 30.000 40.000 Car +Ck5ZjBf1nLM_30.000_40.000.wav 30.000 40.000 Car +CqNyeZeHb8Y_30.000_40.000.wav 30.000 40.000 Car +Cs1d7Ibk8CA_220.000_230.000.wav 220.000 230.000 Car +CuS-ok0xG9g_0.000_10.000.wav 0.000 10.000 Car +CuaBHNKycvI_30.000_40.000.wav 30.000 40.000 Car +Cwur_jvxMzY_360.000_370.000.wav 360.000 370.000 Car +DEGSyVygE98_110.000_120.000.wav 110.000 120.000 Car +DLxTYAUifjU_30.000_40.000.wav 30.000 40.000 Car +DkKpnvJk9u0_30.000_40.000.wav 30.000 40.000 Car +DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Car +Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car +E8NgxTz1d90_30.000_40.000.wav 30.000 40.000 Car +ExqedxdXuBc_70.000_80.000.wav 70.000 80.000 Car +FCxEMSNSEuI_160.000_170.000.wav 160.000 170.000 Car +FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Car +FFSWmryaZ60_30.000_40.000.wav 30.000 40.000 Car +FYk2paHPSdg_30.000_40.000.wav 30.000 40.000 Car +Fo_FDiZhzDo_30.000_40.000.wav 30.000 40.000 Car +GteozUDpJRc_30.000_40.000.wav 30.000 40.000 Car +GwBS2NzjAvA_30.000_40.000.wav 30.000 40.000 Car +H8d1mZOqb1c_110.000_120.000.wav 110.000 120.000 Car +HFF_PpqLQ9w_30.000_40.000.wav 30.000 40.000 Car +HHlb-h2Pc7o_30.000_40.000.wav 30.000 40.000 Car +Hu8lxbHYaqg_40.000_50.000.wav 40.000 50.000 Car +I-HlrcP6Qg4_30.000_40.000.wav 30.000 40.000 Car 
+I7vs2H-Htt8_480.000_490.000.wav 480.000 490.000 Car +IblhEF_MiH8_400.000_410.000.wav 400.000 410.000 Car +JgXnbgS_XBk_480.000_490.000.wav 480.000 490.000 Car +Ju7Kg_H2iZQ_30.000_40.000.wav 30.000 40.000 Car +KiCB6pP6EEo_100.000_110.000.wav 100.000 110.000 Car +Kwpn3utYEHM_30.000_40.000.wav 30.000 40.000 Car +Ky9Kw-0XwAs_30.000_40.000.wav 30.000 40.000 Car +KzKDk-UgS54_30.000_40.000.wav 30.000 40.000 Car +L1qC8DicAZE_70.000_80.000.wav 70.000 80.000 Car +L4N0LOYZrFo_30.000_40.000.wav 30.000 40.000 Car +L535vIV3ED4_40.000_50.000.wav 40.000 50.000 Car +L9YtOeck3A0_0.000_10.000.wav 0.000 10.000 Car +LEtkHiZZugk_30.000_40.000.wav 30.000 40.000 Car +LLkNFGrrgUo_30.000_40.000.wav 30.000 40.000 Car +LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Car +M7NvD1WJQ7o_70.000_80.000.wav 70.000 80.000 Car +M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car +Mxn2FKuNwiI_20.000_30.000.wav 20.000 30.000 Car +NMqSBlEq14Q_30.000_40.000.wav 30.000 40.000 Car +NoPbk9fy6uw_10.000_20.000.wav 10.000 20.000 Car +O36torHptH4_30.000_40.000.wav 30.000 40.000 Car +OBwh-KGukE8_30.000_40.000.wav 30.000 40.000 Car +Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car +PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Car +PfXdcsW8dJI_540.000_550.000.wav 540.000 550.000 Car +QAWuHvVCI6g_30.000_40.000.wav 30.000 40.000 Car +QBMDnMRwQCc_70.000_80.000.wav 70.000 80.000 Car +QzrS-S7OerE_370.000_380.000.wav 370.000 380.000 Car +R0BtkTm_CPI_30.000_40.000.wav 30.000 40.000 Car +SEHxfje9Eio_30.000_40.000.wav 30.000 40.000 Car +Sb3V17F8xU8_360.000_370.000.wav 360.000 370.000 Car +SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car +SqWkV-UQ6CI_30.000_40.000.wav 30.000 40.000 Car +TWDytzefXXc_10.000_20.000.wav 10.000 20.000 Car +Tv67JhZDAYs_30.000_40.000.wav 30.000 40.000 Car +VTwVF3xRSWg_12.000_22.000.wav 12.000 22.000 Car +VulCKZgWspc_570.000_580.000.wav 570.000 580.000 Car +Vx6mttDHWfo_30.000_40.000.wav 30.000 40.000 Car +W11cJ9HZNaY_30.000_40.000.wav 30.000 40.000 Car +WLXQgcx8qTI_30.000_40.000.wav 30.000 40.000 Car +WMbdMQ7rdFs_30.000_40.000.wav 30.000 40.000 Car +WZoQD6cInx8_360.000_370.000.wav 360.000 370.000 Car +WffmaOr2p8I_30.000_40.000.wav 30.000 40.000 Car +WoynilrteLU_30.000_40.000.wav 30.000 40.000 Car +WxrKq0aI0iM_130.000_140.000.wav 130.000 140.000 Car +X60eVxecY3I_30.000_40.000.wav 30.000 40.000 Car +X8fEzx-fA0U_80.000_90.000.wav 80.000 90.000 Car +XVxlZqwWcBI_10.000_20.000.wav 10.000 20.000 Car +Xnd8ERrynEo_120.000_130.000.wav 120.000 130.000 Car +XqXLI7bDb-I_0.000_7.000.wav 0.000 7.000 Car +XyCjByHuDIk_260.000_270.000.wav 260.000 270.000 Car +XzE7mp3pVik_0.000_10.000.wav 0.000 10.000 Car +Y5e8BW513ww_20.000_30.000.wav 20.000 30.000 Car +YJdBwuIn4Ec_30.000_40.000.wav 30.000 40.000 Car +YTFJUFWcRns_30.000_40.000.wav 30.000 40.000 Car +YY9aConw2QE_0.000_10.000.wav 0.000 10.000 Car +Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car +Ys_rO2Ieg1U_30.000_40.000.wav 30.000 40.000 Car +Z34SD-OEpJI_10.000_20.000.wav 10.000 20.000 Car +Z8cigemT5_g_210.000_220.000.wav 210.000 220.000 Car +ZJW7ymsioQc_16.000_26.000.wav 16.000 26.000 Car +ZY6A9ZDkudg_130.000_140.000.wav 130.000 140.000 Car +_Mw9lKigni4_30.000_40.000.wav 30.000 40.000 Car +_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Car +_yU0-fmspFY_210.000_220.000.wav 210.000 220.000 Car +a5vTn5286-A_80.000_90.000.wav 80.000 90.000 Car +aCX6vJhHO2c_30.000_40.000.wav 30.000 40.000 Car +aHEAK0iWqKk_180.000_190.000.wav 180.000 190.000 Car +aOVPHKqKjyQ_90.000_100.000.wav 90.000 100.000 Car +aUq4glO5ryE_30.000_40.000.wav 30.000 40.000 Car +aW3DY8XDrmw_22.000_32.000.wav 22.000 32.000 Car 
+aa4uhPvKviY_30.000_40.000.wav 30.000 40.000 Car +akgqVmFFDiY_30.000_40.000.wav 30.000 40.000 Car +buOEFwXhoe0_310.000_320.000.wav 310.000 320.000 Car +cHCIoXF7moA_30.000_40.000.wav 30.000 40.000 Car +cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car +cbYZQRz09bc_390.000_400.000.wav 390.000 400.000 Car +d-do1XZ8f_E_30.000_40.000.wav 30.000 40.000 Car +d3gMwtMK6Gs_30.000_40.000.wav 30.000 40.000 Car +d6AioJ8CkTc_30.000_40.000.wav 30.000 40.000 Car +dAud19zNZyw_190.000_200.000.wav 190.000 200.000 Car +dC1TVxwiitc_30.000_40.000.wav 30.000 40.000 Car +dFqOBLxhEl8_20.000_30.000.wav 20.000 30.000 Car +dSfcznv4KLo_30.000_40.000.wav 30.000 40.000 Car +dThSTe35jb0_50.000_60.000.wav 50.000 60.000 Car +dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car +dmJH84FnQa8_30.000_40.000.wav 30.000 40.000 Car +e9xPBfEJni8_230.000_240.000.wav 230.000 240.000 Car +eAl9WwRaWUE_30.000_40.000.wav 30.000 40.000 Car +eAt6si6k65c_30.000_40.000.wav 30.000 40.000 Car +eHiqCLHmoxI_0.000_8.000.wav 0.000 8.000 Car +eV5JX81GzqA_150.000_160.000.wav 150.000 160.000 Car +er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Car +eyFPHlybqDg_30.000_40.000.wav 30.000 40.000 Car +f70nsY7ThBA_220.000_230.000.wav 220.000 230.000 Car +fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car +fZMPDCNyQxE_30.000_40.000.wav 30.000 40.000 Car +f__6chtFRM0_30.000_40.000.wav 30.000 40.000 Car +fdDTuo_COG8_90.000_100.000.wav 90.000 100.000 Car +gFJjYWXeBn0_30.000_40.000.wav 30.000 40.000 Car +g_DBLppDZAs_30.000_40.000.wav 30.000 40.000 Car +gaFQgJLQHtU_90.000_100.000.wav 90.000 100.000 Car +gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car +hN1ykzC8kZM_30.000_40.000.wav 30.000 40.000 Car +hQ_yyPI46FI_11.000_21.000.wav 11.000 21.000 Car +haiMRJEH-Aw_0.000_9.000.wav 0.000 9.000 Car +hsC_sT0A4XM_30.000_40.000.wav 30.000 40.000 Car +ihQDd1CqFBw_70.000_80.000.wav 70.000 80.000 Car +ii87iO6JboA_10.000_20.000.wav 10.000 20.000 Car +j2R1zurR39E_30.000_40.000.wav 30.000 40.000 Car +j42ETHcp044_0.000_10.000.wav 0.000 10.000 Car +j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car +jCeUZwd8b2w_0.000_10.000.wav 0.000 10.000 Car +jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car +kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Car +l6_h_YHuTbY_30.000_40.000.wav 30.000 40.000 Car +lRrv5m9Xu4k_30.000_40.000.wav 30.000 40.000 Car +lb1awXgoyQE_0.000_10.000.wav 0.000 10.000 Car +llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car +lu5teS1j1RQ_0.000_10.000.wav 0.000 10.000 Car +mCmjh_EJtb4_30.000_40.000.wav 30.000 40.000 Car +nFqf1vflJaI_350.000_360.000.wav 350.000 360.000 Car +njodYtK0Hqg_30.000_40.000.wav 30.000 40.000 Car +noymXcxyxis_30.000_40.000.wav 30.000 40.000 Car +o2CmtHNUrXg_30.000_40.000.wav 30.000 40.000 Car +oPJVdi0cqNE_30.000_40.000.wav 30.000 40.000 Car +oxJYMzEmtk4_10.000_20.000.wav 10.000 20.000 Car +pPnLErF3GOY_30.000_40.000.wav 30.000 40.000 Car +pXX6cK4xtiY_11.000_21.000.wav 11.000 21.000 Car +qC5M7BAsKOA_0.000_10.000.wav 0.000 10.000 Car +qg4WxBm8h_w_510.000_520.000.wav 510.000 520.000 Car +qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car +rgeu0Gtf3Es_40.000_50.000.wav 40.000 50.000 Car +s3-i5eUpe6c_30.000_40.000.wav 30.000 40.000 Car +s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car +syCQldBsAtg_30.000_40.000.wav 30.000 40.000 Car +tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Car +teoER4j9H14_290.000_300.000.wav 290.000 300.000 Car +uFSkczD2i14_30.000_40.000.wav 30.000 40.000 Car +uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car +uYqlVTlSgbM_40.000_50.000.wav 40.000 50.000 Car +v8Kry1CbTkM_310.000_320.000.wav 310.000 320.000 Car +vF2zXcbADUk_20.000_30.000.wav 20.000 30.000 
Car +vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 Car +vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car +vW1nk4o9u5g_30.000_40.000.wav 30.000 40.000 Car +vdFYBSlmsXw_30.000_40.000.wav 30.000 40.000 Car +vtE1J8HsCUs_30.000_40.000.wav 30.000 40.000 Car +w0vy1YvNcOg_30.000_40.000.wav 30.000 40.000 Car +wDKrcZ7xLY8_80.000_90.000.wav 80.000 90.000 Car +wM-sBzIDzok_30.000_40.000.wav 30.000 40.000 Car +wUY4eWJt17w_30.000_40.000.wav 30.000 40.000 Car +we66pU0MN1M_30.000_40.000.wav 30.000 40.000 Car +wjfMWiYLDWA_30.000_40.000.wav 30.000 40.000 Car +wu3-_VKULZU_30.000_40.000.wav 30.000 40.000 Car +wwNIm8bgzKc_30.000_40.000.wav 30.000 40.000 Car +xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car +xsT5ZJUnBg0_160.000_170.000.wav 160.000 170.000 Car +y9DFJEsiTLk_110.000_120.000.wav 110.000 120.000 Car +yESwp_fg0Po_70.000_80.000.wav 70.000 80.000 Car +yQg3eMb0QKU_30.000_40.000.wav 30.000 40.000 Car +yQjnNR7fXKo_50.000_60.000.wav 50.000 60.000 Car +zCuKYr_oMlE_60.000_70.000.wav 60.000 70.000 Car +zz35Va7tYmA_30.000_40.000.wav 30.000 40.000 Car +-CZ1LIc8aos_20.000_30.000.wav 20.000 30.000 Car passing by +-WgJ-M292Yc_30.000_40.000.wav 30.000 40.000 Car passing by +-iAAxJkoqcM_0.000_6.000.wav 0.000 6.000 Car passing by +0mQcGLpc8to_30.000_40.000.wav 30.000 40.000 Car passing by +1HtGgZnlKjU_30.000_40.000.wav 30.000 40.000 Car passing by +2IsAlhq0XFc_30.000_40.000.wav 30.000 40.000 Car passing by +2UvEmetE__I_30.000_40.000.wav 30.000 40.000 Car passing by +2oHGIzH_XzA_30.000_40.000.wav 30.000 40.000 Car passing by +3mor5mPSYoU_7.000_17.000.wav 7.000 17.000 Car passing by +8SYLYWR47EE_30.000_40.000.wav 30.000 40.000 Car passing by +8rzhhvS0tGc_30.000_40.000.wav 30.000 40.000 Car passing by +8v377AXrgac_30.000_40.000.wav 30.000 40.000 Car passing by +9lMtTDKyDEk_30.000_40.000.wav 30.000 40.000 Car passing by +BWoL8oKoTFI_30.000_40.000.wav 30.000 40.000 Car passing by +BsvD806qNM8_10.000_20.000.wav 10.000 20.000 Car passing by +C3LLtToB2zA_30.000_40.000.wav 30.000 40.000 Car passing by +Dk6b9dVD0i8_6.000_16.000.wav 6.000 16.000 Car passing by +Dw1q9rBv7oU_30.000_40.000.wav 30.000 40.000 Car passing by +EqFuY_U0Yz0_30.000_40.000.wav 30.000 40.000 Car passing by +FjpOboRcrNc_10.000_20.000.wav 10.000 20.000 Car passing by +FjyZV8zIJ0k_30.000_40.000.wav 30.000 40.000 Car passing by +Fn7eSPVvgCQ_30.000_40.000.wav 30.000 40.000 Car passing by +G6A-sT2DOjY_30.000_40.000.wav 30.000 40.000 Car passing by +GBXRuYIvhfM_30.000_40.000.wav 30.000 40.000 Car passing by +HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Car passing by +HQQxGJKg1iM_30.000_40.000.wav 30.000 40.000 Car passing by +If-V0XO-mpo_30.000_40.000.wav 30.000 40.000 Car passing by +JtuNiusRRLk_30.000_40.000.wav 30.000 40.000 Car passing by +M8BFtmQRHq4_200.000_210.000.wav 200.000 210.000 Car passing by +NKPAwhwZmqs_30.000_40.000.wav 30.000 40.000 Car passing by +Oa2Os8eOUjs_30.000_40.000.wav 30.000 40.000 Car passing by +QcLfJE-YfJY_30.000_40.000.wav 30.000 40.000 Car passing by +SkbFczIabRY_30.000_40.000.wav 30.000 40.000 Car passing by +VAiH1LX8guk_17.000_27.000.wav 17.000 27.000 Car passing by +Yc_WuISxfLI_30.000_40.000.wav 30.000 40.000 Car passing by +Yd10enP9ykM_30.000_40.000.wav 30.000 40.000 Car passing by +_HGGCwtyNxM_30.000_40.000.wav 30.000 40.000 Car passing by +a2U10_mi5as_30.000_40.000.wav 30.000 40.000 Car passing by +aB6FDPKAPus_30.000_40.000.wav 30.000 40.000 Car passing by +bDFQWubN4x4_30.000_40.000.wav 30.000 40.000 Car passing by +cW859JAzVZ0_30.000_40.000.wav 30.000 40.000 Car passing by +dDTvjXXFkDg_30.000_40.000.wav 30.000 40.000 Car passing by 
+dfwr8wgZU8M_40.000_50.000.wav 40.000 50.000 Car passing by +fJLCT3xDGxA_30.000_40.000.wav 30.000 40.000 Car passing by +gc6VlixMHXE_30.000_40.000.wav 30.000 40.000 Car passing by +gd_KjDM4fi8_0.000_10.000.wav 0.000 10.000 Car passing by +j7OEpDiK3IA_30.000_40.000.wav 30.000 40.000 Car passing by +jZxusrD28rM_30.000_40.000.wav 30.000 40.000 Car passing by +llZBUsAwRWc_30.000_40.000.wav 30.000 40.000 Car passing by +m_dCO5bBCic_26.000_36.000.wav 26.000 36.000 Car passing by +qDQX7Xi3GsQ_30.000_40.000.wav 30.000 40.000 Car passing by +qxLdv8u_Ujw_0.000_5.000.wav 0.000 5.000 Car passing by +reP-OOWiLWU_30.000_40.000.wav 30.000 40.000 Car passing by +s4jG5ZJYCvQ_30.000_40.000.wav 30.000 40.000 Car passing by +s5s3aR8Z7I8_350.000_360.000.wav 350.000 360.000 Car passing by +uUyB4q7jgn4_30.000_40.000.wav 30.000 40.000 Car passing by +vPDXFKcdaS4_0.000_10.000.wav 0.000 10.000 Car passing by +wD4QouhX8zo_30.000_40.000.wav 30.000 40.000 Car passing by +xqH9TpH6Xy0_0.000_10.000.wav 0.000 10.000 Car passing by +zd67ihUZ1u4_25.000_35.000.wav 25.000 35.000 Car passing by +-3z5mFRgbxc_30.000_40.000.wav 30.000 40.000 Bus +0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Bus +0lPcHRhXlWk_30.000_40.000.wav 30.000 40.000 Bus +1E1evA4T_Tk_30.000_40.000.wav 30.000 40.000 Bus +1hIg-Lsvc7Q_30.000_40.000.wav 30.000 40.000 Bus +6-yQsEH2WYA_30.000_40.000.wav 30.000 40.000 Bus +6Y8wSI1l-Lw_30.000_40.000.wav 30.000 40.000 Bus +7T04388Ijk8_30.000_40.000.wav 30.000 40.000 Bus +8E7okHnCcTA_30.000_40.000.wav 30.000 40.000 Bus +8oEdgb8iXYA_1.000_11.000.wav 1.000 11.000 Bus +AdpNSGX2_Pk_10.000_20.000.wav 10.000 20.000 Bus +AwJ8orGuOXg_2.000_12.000.wav 2.000 12.000 Bus +BS1fqEDAvh0_330.000_340.000.wav 330.000 340.000 Bus +CoFbRc1OxFU_9.000_19.000.wav 9.000 19.000 Bus +DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Bus +DYcXvyBFc5w_30.000_40.000.wav 30.000 40.000 Bus +DYdalOQnx1Y_30.000_40.000.wav 30.000 40.000 Bus +DkwFXd5nYLE_40.000_50.000.wav 40.000 50.000 Bus +FBMR3pW9H9o_30.000_40.000.wav 30.000 40.000 Bus +FEGa4e6RAlw_30.000_40.000.wav 30.000 40.000 Bus +Ge_KWS-0098_30.000_40.000.wav 30.000 40.000 Bus +HxMoMMrA6Eo_30.000_40.000.wav 30.000 40.000 Bus +I7esm6vqqZ4_30.000_40.000.wav 30.000 40.000 Bus +JLj11umr1CE_0.000_10.000.wav 0.000 10.000 Bus +JwAhcHHF2qg_30.000_40.000.wav 30.000 40.000 Bus +LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Bus +LzZ_nxuZ8Co_30.000_40.000.wav 30.000 40.000 Bus +LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Bus +Nyi9_-u6-w0_30.000_40.000.wav 30.000 40.000 Bus +O_SKumO328I_30.000_40.000.wav 30.000 40.000 Bus +Owg_XU9XmRM_30.000_40.000.wav 30.000 40.000 Bus +P94rcZSuTT8_30.000_40.000.wav 30.000 40.000 Bus +PP741kd2vRM_30.000_40.000.wav 30.000 40.000 Bus +Qna9qrV8_go_30.000_40.000.wav 30.000 40.000 Bus +Qt7FJkuqWPE_30.000_40.000.wav 30.000 40.000 Bus +UcQ7cVukaxY_21.000_31.000.wav 21.000 31.000 Bus +W8fIlauyJkk_30.000_40.000.wav 30.000 40.000 Bus +WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Bus +WvquSD2PcCE_30.000_40.000.wav 30.000 40.000 Bus +a9B_HA3y8WQ_30.000_40.000.wav 30.000 40.000 Bus +cEEoKQ38fHY_30.000_40.000.wav 30.000 40.000 Bus +er1vQ-nse_g_30.000_40.000.wav 30.000 40.000 Bus +fLvM4bbpg6w_0.000_10.000.wav 0.000 10.000 Bus +fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Bus +gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Bus +jaSK_t8QP1E_30.000_40.000.wav 30.000 40.000 Bus +ji_YCMygNHQ_8.000_18.000.wav 8.000 18.000 Bus +kNKfoDp0uUw_30.000_40.000.wav 30.000 40.000 Bus +kdDgTDfo9HY_100.000_110.000.wav 100.000 110.000 Bus +lHP0q2sQzPQ_30.000_40.000.wav 30.000 40.000 Bus 
+mGG8rop4Jig_30.000_40.000.wav 30.000 40.000 Bus +oHKTmTLEy68_11.000_21.000.wav 11.000 21.000 Bus +tAfucDIyRiM_30.000_40.000.wav 30.000 40.000 Bus +tQd0vFueRKs_30.000_40.000.wav 30.000 40.000 Bus +ucICmff0K-Q_30.000_40.000.wav 30.000 40.000 Bus +x-2Abohj8VY_30.000_40.000.wav 30.000 40.000 Bus +xFr2xX6PulQ_70.000_80.000.wav 70.000 80.000 Bus +yfSBqp5IZSM_10.000_20.000.wav 10.000 20.000 Bus +-2sE5CH8Wb8_30.000_40.000.wav 30.000 40.000 Truck +-BY64_p-vtM_30.000_40.000.wav 30.000 40.000 Truck +-fJsZm3YRc0_30.000_40.000.wav 30.000 40.000 Truck +-t-htrAtNvM_30.000_40.000.wav 30.000 40.000 Truck +-zNEcuo28oE_30.000_40.000.wav 30.000 40.000 Truck +01WuUBxFBp4_30.000_40.000.wav 30.000 40.000 Truck +077aWlQn6XI_30.000_40.000.wav 30.000 40.000 Truck +0Ga7T-2e490_17.000_27.000.wav 17.000 27.000 Truck +0N9EN0BEjP0_430.000_440.000.wav 430.000 440.000 Truck +10aF24rMeu0_30.000_40.000.wav 30.000 40.000 Truck +2HZcxlRs-hg_30.000_40.000.wav 30.000 40.000 Truck +2Jpg_KvJWL0_30.000_40.000.wav 30.000 40.000 Truck +2Tmi7EqpGZQ_0.000_10.000.wav 0.000 10.000 Truck +4DlKNmVcoek_20.000_30.000.wav 20.000 30.000 Truck +4MRzQbAIyV4_90.000_100.000.wav 90.000 100.000 Truck +4Tpy1lsfcSM_30.000_40.000.wav 30.000 40.000 Truck +4ep09nZl3LA_30.000_40.000.wav 30.000 40.000 Truck +5DW8WjxxCag_30.000_40.000.wav 30.000 40.000 Truck +5DjZHCumLfs_11.000_21.000.wav 11.000 21.000 Truck +5QP1Tc3XbDc_30.000_40.000.wav 30.000 40.000 Truck +5V0xKS-FGMk_30.000_40.000.wav 30.000 40.000 Truck +5fLzQegwHUg_30.000_40.000.wav 30.000 40.000 Truck +6HL_DKWK-WA_10.000_20.000.wav 10.000 20.000 Truck +6VQGk8IrV-4_30.000_40.000.wav 30.000 40.000 Truck +6Y8bKS6KLeE_30.000_40.000.wav 30.000 40.000 Truck +6xEHP-C-ZuU_30.000_40.000.wav 30.000 40.000 Truck +6yyToq9cW9A_60.000_70.000.wav 60.000 70.000 Truck +7Gua0-UrKIw_30.000_40.000.wav 30.000 40.000 Truck +7nglQSmcjAk_30.000_40.000.wav 30.000 40.000 Truck +81DteAPIhoE_30.000_40.000.wav 30.000 40.000 Truck +84E9i9_ELBs_30.000_40.000.wav 30.000 40.000 Truck +8jblPMBafKE_30.000_40.000.wav 30.000 40.000 Truck +8k17D6qiuqI_30.000_40.000.wav 30.000 40.000 Truck +9EsgN-WS2qY_30.000_40.000.wav 30.000 40.000 Truck +9LJnjmcRcb8_280.000_290.000.wav 280.000 290.000 Truck +9yhMtJ50sys_30.000_40.000.wav 30.000 40.000 Truck +A9KMqwqLboE_30.000_40.000.wav 30.000 40.000 Truck +ARIVxBOc0BQ_40.000_50.000.wav 40.000 50.000 Truck +AwFuGITwrms_30.000_40.000.wav 30.000 40.000 Truck +BQVXzH6YK8g_30.000_40.000.wav 30.000 40.000 Truck +CnYWJp2bknU_50.000_60.000.wav 50.000 60.000 Truck +DRqKOlP8BmU_110.000_120.000.wav 110.000 120.000 Truck +DXlTakKvLzg_30.000_40.000.wav 30.000 40.000 Truck +DkVfro9iq80_30.000_40.000.wav 30.000 40.000 Truck +Dmy4EjohxxU_60.000_70.000.wav 60.000 70.000 Truck +DvMFQ64YwcI_30.000_40.000.wav 30.000 40.000 Truck +FEoMTMxzn3U_30.000_40.000.wav 30.000 40.000 Truck +GTk_6JDmtCY_230.000_240.000.wav 230.000 240.000 Truck +HDEPd5MIaow_30.000_40.000.wav 30.000 40.000 Truck +HQkLVac7z9Q_70.000_80.000.wav 70.000 80.000 Truck +I4VDcVTE4YA_30.000_40.000.wav 30.000 40.000 Truck +IxlvxvG8zOE_110.000_120.000.wav 110.000 120.000 Truck +JLzD44Im1Ec_30.000_40.000.wav 30.000 40.000 Truck +K4Hcb00hTTY_30.000_40.000.wav 30.000 40.000 Truck +L2M3xanqQP8_30.000_40.000.wav 30.000 40.000 Truck +LA5TekLaIPI_10.000_20.000.wav 10.000 20.000 Truck +LhRNnXaSsCk_30.000_40.000.wav 30.000 40.000 Truck +MWTTe0M9vi4_30.000_40.000.wav 30.000 40.000 Truck +Nkqx09b-xyI_70.000_80.000.wav 70.000 80.000 Truck +NqzZbJJl3E4_30.000_40.000.wav 30.000 40.000 Truck +OPd0cz1hRqc_30.000_40.000.wav 30.000 40.000 Truck +PCl-q7lCT_U_50.000_60.000.wav 
50.000 60.000 Truck +PNaLTW50fxM_60.000_70.000.wav 60.000 70.000 Truck +PO1eaJ7tQOg_180.000_190.000.wav 180.000 190.000 Truck +PSt0xAYgf4g_0.000_10.000.wav 0.000 10.000 Truck +Pef6g19i5iI_30.000_40.000.wav 30.000 40.000 Truck +Q1CMSV81_ws_30.000_40.000.wav 30.000 40.000 Truck +SiBIYAiIajM_30.000_40.000.wav 30.000 40.000 Truck +T6oYCFRafPs_30.000_40.000.wav 30.000 40.000 Truck +WdubBeFntYQ_460.000_470.000.wav 460.000 470.000 Truck +_ZiJA6phEq8_30.000_40.000.wav 30.000 40.000 Truck +_jfv_ziZWII_60.000_70.000.wav 60.000 70.000 Truck +acvV6yYNc7Y_30.000_40.000.wav 30.000 40.000 Truck +bQSaQ0iX_vk_30.000_40.000.wav 30.000 40.000 Truck +bhxN5w03yS0_30.000_40.000.wav 30.000 40.000 Truck +ckt7YEGcSoY_30.000_40.000.wav 30.000 40.000 Truck +eIkUuCRE_0U_30.000_40.000.wav 30.000 40.000 Truck +gxVhAVNjSU0_30.000_40.000.wav 30.000 40.000 Truck +hDVNQOJCvOk_30.000_40.000.wav 30.000 40.000 Truck +ieZVo7W3BQ4_30.000_40.000.wav 30.000 40.000 Truck +ikmE_kRvDAc_30.000_40.000.wav 30.000 40.000 Truck +jwZTKNsbf58_70.000_80.000.wav 70.000 80.000 Truck +kH6fFjIZkB0_30.000_40.000.wav 30.000 40.000 Truck +kr8ssbrDDMY_30.000_40.000.wav 30.000 40.000 Truck +lp66EaEOOoU_30.000_40.000.wav 30.000 40.000 Truck +n4o1r8Ai66o_30.000_40.000.wav 30.000 40.000 Truck +nDtrUUc2J2U_0.000_10.000.wav 0.000 10.000 Truck +nMaSkwx6cHE_30.000_40.000.wav 30.000 40.000 Truck +p70IcMwsW9M_30.000_40.000.wav 30.000 40.000 Truck +pJ1fore8JbQ_30.000_40.000.wav 30.000 40.000 Truck +pt-J_L-OFI8_0.000_10.000.wav 0.000 10.000 Truck +rdanJP7Usrg_30.000_40.000.wav 30.000 40.000 Truck +srTX18ikXkE_10.000_20.000.wav 10.000 20.000 Truck +tuplsUUDXKw_30.000_40.000.wav 30.000 40.000 Truck +x6vuWsdeS3s_30.000_40.000.wav 30.000 40.000 Truck +xMClk12ouB8_30.000_40.000.wav 30.000 40.000 Truck +ycqDMKTrvLY_30.000_40.000.wav 30.000 40.000 Truck +yk5LqHTtHLo_30.000_40.000.wav 30.000 40.000 Truck +yrscqyUOIlI_30.000_40.000.wav 30.000 40.000 Truck +zM3chsL-B7U_30.000_40.000.wav 30.000 40.000 Truck +06si40RVDco_30.000_40.000.wav 30.000 40.000 Motorcycle +0DzsPL-xElE_20.000_30.000.wav 20.000 30.000 Motorcycle +145N68nh4m0_120.000_130.000.wav 120.000 130.000 Motorcycle +16vw4K9qJnY_30.000_40.000.wav 30.000 40.000 Motorcycle +21QlKF17ipc_30.000_40.000.wav 30.000 40.000 Motorcycle +3LulQoOXNB0_30.000_40.000.wav 30.000 40.000 Motorcycle +45JHcLU57B8_20.000_30.000.wav 20.000 30.000 Motorcycle +4NZkW-XaIa4_30.000_40.000.wav 30.000 40.000 Motorcycle +506I6LfdDuk_50.000_60.000.wav 50.000 60.000 Motorcycle +6MCy1lh4qaw_20.000_30.000.wav 20.000 30.000 Motorcycle +6R8cO4ARzkY_30.000_40.000.wav 30.000 40.000 Motorcycle +6taAP7SFewI_30.000_40.000.wav 30.000 40.000 Motorcycle +7g6aZTBe2xE_30.000_40.000.wav 30.000 40.000 Motorcycle +9HcahqYUVoc_90.000_100.000.wav 90.000 100.000 Motorcycle +9N1iw5Vdim8_20.000_30.000.wav 20.000 30.000 Motorcycle +ANWU9Hiy_5k_40.000_50.000.wav 40.000 50.000 Motorcycle +BTNz6NftP34_30.000_40.000.wav 30.000 40.000 Motorcycle +BxnLAGsByCI_10.000_20.000.wav 10.000 20.000 Motorcycle +CZgx_6XaEkg_30.000_40.000.wav 30.000 40.000 Motorcycle +D3BJuOwltoI_10.000_20.000.wav 10.000 20.000 Motorcycle +FgN9v1jYqjA_30.000_40.000.wav 30.000 40.000 Motorcycle +HQ8eR2lvjSE_30.000_40.000.wav 30.000 40.000 Motorcycle +Mb-GyQEKoEc_30.000_40.000.wav 30.000 40.000 Motorcycle +Pair_NsHdTc_30.000_40.000.wav 30.000 40.000 Motorcycle +UFIBEBkm7ao_30.000_40.000.wav 30.000 40.000 Motorcycle +UWz5OIijWM4_30.000_40.000.wav 30.000 40.000 Motorcycle +WLX3Db60418_20.000_30.000.wav 20.000 30.000 Motorcycle +X5Xs8Y1cJK0_30.000_40.000.wav 30.000 40.000 Motorcycle 
+ZGf0vrZStwI_30.000_40.000.wav 30.000 40.000 Motorcycle +ZfkO1HlI0zM_30.000_40.000.wav 30.000 40.000 Motorcycle +bhtB2Zgh9Q8_110.000_120.000.wav 110.000 120.000 Motorcycle +d-m8eXCpeDg_30.000_40.000.wav 30.000 40.000 Motorcycle +d21IwtH2oHI_30.000_40.000.wav 30.000 40.000 Motorcycle +dhaKGPCgtfw_30.000_40.000.wav 30.000 40.000 Motorcycle +ee-0JGvEIng_30.000_40.000.wav 30.000 40.000 Motorcycle +epGDNMrsQb8_40.000_50.000.wav 40.000 50.000 Motorcycle +ezUkPETm6cs_30.000_40.000.wav 30.000 40.000 Motorcycle +f724u5z_UDw_30.000_40.000.wav 30.000 40.000 Motorcycle +gGmWm1i6pVo_30.000_40.000.wav 30.000 40.000 Motorcycle +i9VjpIbM3iE_410.000_420.000.wav 410.000 420.000 Motorcycle +iMp8nODaotA_580.000_590.000.wav 580.000 590.000 Motorcycle +lVW2CqsHJ4Y_30.000_40.000.wav 30.000 40.000 Motorcycle +lj7hzmz19-M_30.000_40.000.wav 30.000 40.000 Motorcycle +mX45CiTjf8I_30.000_40.000.wav 30.000 40.000 Motorcycle +mbLiZ_jpgeY_20.000_30.000.wav 20.000 30.000 Motorcycle +owZDBEq6WdU_30.000_40.000.wav 30.000 40.000 Motorcycle +pNMBIqvbyB4_30.000_40.000.wav 30.000 40.000 Motorcycle +po-tnKZAzdg_40.000_50.000.wav 40.000 50.000 Motorcycle +qAQuljp-atA_30.000_40.000.wav 30.000 40.000 Motorcycle +r0Oll28wmXs_30.000_40.000.wav 30.000 40.000 Motorcycle +sAMjMyCdGOc_30.000_40.000.wav 30.000 40.000 Motorcycle +vHlqKDR7ggA_30.000_40.000.wav 30.000 40.000 Motorcycle +wPfv8ifzzyg_30.000_40.000.wav 30.000 40.000 Motorcycle +wyhurCZbKQU_30.000_40.000.wav 30.000 40.000 Motorcycle +xQTPEQDb0Gg_30.000_40.000.wav 30.000 40.000 Motorcycle +xTPmoYwgKf4_30.000_40.000.wav 30.000 40.000 Motorcycle +xXGIKM4daMU_30.000_40.000.wav 30.000 40.000 Motorcycle +xZ8hQliZqhg_160.000_170.000.wav 160.000 170.000 Motorcycle +xuMBy2NoROI_30.000_40.000.wav 30.000 40.000 Motorcycle +z_8yGVO1qws_30.000_40.000.wav 30.000 40.000 Motorcycle +-BaVEk1zS2g_50.000_60.000.wav 50.000 60.000 Train +-Q4fBQ4egrs_0.000_10.000.wav 0.000 10.000 Train +-QxSFr1cYuQ_20.000_30.000.wav 20.000 30.000 Train +-ZdReI9dL6M_530.000_540.000.wav 530.000 540.000 Train +0YIyGEM0yG0_550.000_560.000.wav 550.000 560.000 Train +1Mk2MJDhLJQ_20.000_30.000.wav 20.000 30.000 Train +2nejPPEWqJ8_320.000_330.000.wav 320.000 330.000 Train +3ACjUf9QpAQ_30.000_40.000.wav 30.000 40.000 Train +3RfrTU1p5SA_500.000_510.000.wav 500.000 510.000 Train +3YJewEC-NWo_30.000_40.000.wav 30.000 40.000 Train +3ZZDuYU2HM4_150.000_160.000.wav 150.000 160.000 Train +3fPX1LaGwJo_60.000_70.000.wav 60.000 70.000 Train +4_gyCWuPxRg_170.000_180.000.wav 170.000 180.000 Train +4l4vGrMD4Tw_550.000_560.000.wav 550.000 560.000 Train +4oT0bxldS80_30.000_40.000.wav 30.000 40.000 Train +4t7Mi3pnSA4_210.000_220.000.wav 210.000 220.000 Train +53oq_Otm_XI_30.000_40.000.wav 30.000 40.000 Train +6OgSNQOTw2U_30.000_40.000.wav 30.000 40.000 Train +6_TGlFO0DCk_10.000_20.000.wav 10.000 20.000 Train +7KdSGBzXvz8_420.000_430.000.wav 420.000 430.000 Train +7W_kcu0CJqI_310.000_320.000.wav 310.000 320.000 Train +8IaInXpdd9M_0.000_10.000.wav 0.000 10.000 Train +8nU1aVscJec_30.000_40.000.wav 30.000 40.000 Train +9LQEZJPNVpw_30.000_40.000.wav 30.000 40.000 Train +9NT6gEiqpWA_30.000_40.000.wav 30.000 40.000 Train +AFhll08KM98_30.000_40.000.wav 30.000 40.000 Train +AHom7lBbtoY_30.000_40.000.wav 30.000 40.000 Train +AK0kZUDk294_2.000_12.000.wav 2.000 12.000 Train +AKPC4rEGoyI_30.000_40.000.wav 30.000 40.000 Train +APsvUzw7bWA_60.000_70.000.wav 60.000 70.000 Train +AshwkKUV07s_23.000_33.000.wav 23.000 33.000 Train +BI2Tol64na0_30.000_40.000.wav 30.000 40.000 Train +BmS2NiuT2c0_160.000_170.000.wav 160.000 170.000 Train 
+CCX_4cW_SAU_0.000_10.000.wav 0.000 10.000 Train +D_nXtMgbPNY_30.000_40.000.wav 30.000 40.000 Train +F-JFxERdA2w_30.000_40.000.wav 30.000 40.000 Train +FoIBRxw0tyE_30.000_40.000.wav 30.000 40.000 Train +G958vjLYBcI_110.000_120.000.wav 110.000 120.000 Train +GFQnh84kNwU_30.000_40.000.wav 30.000 40.000 Train +GKc8PCTen8Q_310.000_320.000.wav 310.000 320.000 Train +I4qODX0fypE_30.000_40.000.wav 30.000 40.000 Train +IIIxN_ziy_I_60.000_70.000.wav 60.000 70.000 Train +IdqEbjujFb8_30.000_40.000.wav 30.000 40.000 Train +K-i81KrH8BQ_30.000_40.000.wav 30.000 40.000 Train +K9pSRLw6FNc_40.000_50.000.wav 40.000 50.000 Train +KPyYUly5xCc_90.000_100.000.wav 90.000 100.000 Train +L3a132_uApg_50.000_60.000.wav 50.000 60.000 Train +LK4b2eJpy24_30.000_40.000.wav 30.000 40.000 Train +LzcNa3HvD7c_30.000_40.000.wav 30.000 40.000 Train +MCYY8tJsnfY_7.000_17.000.wav 7.000 17.000 Train +MDF2vsjm8jU_10.000_20.000.wav 10.000 20.000 Train +MMfiWJVftMA_60.000_70.000.wav 60.000 70.000 Train +MYzVHespZ-E_30.000_40.000.wav 30.000 40.000 Train +Mbe4rlNiM84_0.000_7.000.wav 0.000 7.000 Train +MczH_PWBNeI_360.000_370.000.wav 360.000 370.000 Train +Mfkif49LLc4_30.000_40.000.wav 30.000 40.000 Train +MwSbYICrYj8_290.000_300.000.wav 290.000 300.000 Train +PJUy17bXlhc_40.000_50.000.wav 40.000 50.000 Train +QDTbchu0LrU_30.000_40.000.wav 30.000 40.000 Train +QZJ5WAYIUh8_70.000_80.000.wav 70.000 80.000 Train +QrAoRSA13bM_30.000_40.000.wav 30.000 40.000 Train +RN-_agT8_Cg_0.000_10.000.wav 0.000 10.000 Train +R_Lpb-51Kl4_30.000_40.000.wav 30.000 40.000 Train +Rhvy7V4F95Q_40.000_50.000.wav 40.000 50.000 Train +Rq-22Cycrpg_30.000_40.000.wav 30.000 40.000 Train +RrlgSfQrqQc_20.000_30.000.wav 20.000 30.000 Train +RwBKGPEg6uA_340.000_350.000.wav 340.000 350.000 Train +T73runykdnE_25.000_35.000.wav 25.000 35.000 Train +T8M6W4yOzI4_30.000_40.000.wav 30.000 40.000 Train +Tmm4H6alHCE_30.000_40.000.wav 30.000 40.000 Train +TyTORMEourg_270.000_280.000.wav 270.000 280.000 Train +UQx0EMXtLZA_60.000_70.000.wav 60.000 70.000 Train +UZx7OAgRMRY_90.000_100.000.wav 90.000 100.000 Train +UerX5Bv2hcs_70.000_80.000.wav 70.000 80.000 Train +UxSUGCvpskM_340.000_350.000.wav 340.000 350.000 Train +V2hln47cP78_130.000_140.000.wav 130.000 140.000 Train +VIe_Qkg5RJI_130.000_140.000.wav 130.000 140.000 Train +WDn851XbWTk_30.000_40.000.wav 30.000 40.000 Train +WFdpQCtpBB4_30.000_40.000.wav 30.000 40.000 Train +XAUtk9lwzU8_30.000_40.000.wav 30.000 40.000 Train +XDTlBb3aYqo_30.000_40.000.wav 30.000 40.000 Train +XKvLkIM8dck_40.000_50.000.wav 40.000 50.000 Train +XQbeLJYzY9k_90.000_100.000.wav 90.000 100.000 Train +XW8pSKLyr0o_20.000_30.000.wav 20.000 30.000 Train +XeYiNanFS_M_120.000_130.000.wav 120.000 130.000 Train +Y10I9JSvJuQ_30.000_40.000.wav 30.000 40.000 Train +YDGf-razgyU_250.000_260.000.wav 250.000 260.000 Train +YFD1Qrlskrg_60.000_70.000.wav 60.000 70.000 Train +Y_jwEflLthg_190.000_200.000.wav 190.000 200.000 Train +Y_ynIwm3qm0_370.000_380.000.wav 370.000 380.000 Train +Zy0goYEHPHU_30.000_40.000.wav 30.000 40.000 Train +_dkeW6lqmq4_30.000_40.000.wav 30.000 40.000 Train +aNO2KEXBCOk_30.000_40.000.wav 30.000 40.000 Train +aXsUHAKbyLs_30.000_40.000.wav 30.000 40.000 Train +ahct5yzUtdE_20.000_30.000.wav 20.000 30.000 Train +arevYmB0qGg_30.000_40.000.wav 30.000 40.000 Train +bCGtzspNbNo_30.000_40.000.wav 30.000 40.000 Train +bI6wPI9kAm8_70.000_80.000.wav 70.000 80.000 Train +bpdCMWWiB_0_30.000_40.000.wav 30.000 40.000 Train +cdrjKqyDrak_420.000_430.000.wav 420.000 430.000 Train +d1o334I5X_k_30.000_40.000.wav 30.000 40.000 Train +dSzZWgbJ378_30.000_40.000.wav 
30.000 40.000 Train +eRclX9l0F_c_150.000_160.000.wav 150.000 160.000 Train +fOVsAMJ3Yms_30.000_40.000.wav 30.000 40.000 Train +fWVfi9pAh_4_10.000_20.000.wav 10.000 20.000 Train +fztkF47lVQg_0.000_10.000.wav 0.000 10.000 Train +g0ICxHjC9Uc_30.000_40.000.wav 30.000 40.000 Train +g2scd3YVgwQ_30.000_40.000.wav 30.000 40.000 Train +g4cA-ifQc70_30.000_40.000.wav 30.000 40.000 Train +g9JVq7wfDIo_30.000_40.000.wav 30.000 40.000 Train +gKMpowHeyKc_30.000_40.000.wav 30.000 40.000 Train +gTFCK9TuLOQ_30.000_40.000.wav 30.000 40.000 Train +gU0mD2fSh4c_500.000_510.000.wav 500.000 510.000 Train +gkH_Zxasn8o_40.000_50.000.wav 40.000 50.000 Train +gvnM4kK4r70_10.000_20.000.wav 10.000 20.000 Train +hH_M56EnnDk_30.000_40.000.wav 30.000 40.000 Train +hVvtTC9AmNs_30.000_40.000.wav 30.000 40.000 Train +hYqzr_rIIAw_30.000_40.000.wav 30.000 40.000 Train +hdYQzH2E-e4_310.000_320.000.wav 310.000 320.000 Train +iZgzRfa-xPQ_30.000_40.000.wav 30.000 40.000 Train +j9Z63H5hvrQ_0.000_10.000.wav 0.000 10.000 Train +jbW2ew8VMfU_50.000_60.000.wav 50.000 60.000 Train +jlz7r-NSUuA_50.000_60.000.wav 50.000 60.000 Train +k0vRZm7ZnQk_280.000_290.000.wav 280.000 290.000 Train +k8H8rn4NaSM_0.000_10.000.wav 0.000 10.000 Train +kbfkq3TuAe0_470.000_480.000.wav 470.000 480.000 Train +lf1Sblrda3A_560.000_570.000.wav 560.000 570.000 Train +m4DS9-5Gkds_30.000_40.000.wav 30.000 40.000 Train +m5HeCy87QYY_380.000_390.000.wav 380.000 390.000 Train +nKM4MUAsVzg_100.000_110.000.wav 100.000 110.000 Train +nY1gcEMzsWI_10.000_20.000.wav 10.000 20.000 Train +nfY_zkJceDw_30.000_40.000.wav 30.000 40.000 Train +oogrnx-_LBA_60.000_70.000.wav 60.000 70.000 Train +pW5SI1ZKUpA_30.000_40.000.wav 30.000 40.000 Train +pbOZLMrJy0A_0.000_10.000.wav 0.000 10.000 Train +pxmrmtEnROk_30.000_40.000.wav 30.000 40.000 Train +q7zzKHFWGkg_30.000_40.000.wav 30.000 40.000 Train +qu8vVFWKszA_30.000_40.000.wav 30.000 40.000 Train +r6mHSfFkY_8_30.000_40.000.wav 30.000 40.000 Train +rNNPQ9DD4no_30.000_40.000.wav 30.000 40.000 Train +rSrBDAgLUoI_460.000_470.000.wav 460.000 470.000 Train +stdjjG6Y5IU_30.000_40.000.wav 30.000 40.000 Train +t_lFhyZaZR0_150.000_160.000.wav 150.000 160.000 Train +txXSE7kgrc8_30.000_40.000.wav 30.000 40.000 Train +uZfsEDo3elY_20.000_30.000.wav 20.000 30.000 Train +umcnfA9veOw_160.000_170.000.wav 160.000 170.000 Train +uysTr0SfhLI_10.000_20.000.wav 10.000 20.000 Train +wM9wNgY8d4g_150.000_160.000.wav 150.000 160.000 Train +xabrKa79prM_30.000_40.000.wav 30.000 40.000 Train +xshKOSEF_6o_0.000_10.000.wav 0.000 10.000 Train +yBVxtq9k8Sg_0.000_10.000.wav 0.000 10.000 Train +yH1r2Bblluw_240.000_250.000.wav 240.000 250.000 Train +yywGJu6jp8U_30.000_40.000.wav 30.000 40.000 Train +z5uKFGeTtNg_30.000_40.000.wav 30.000 40.000 Train diff --git a/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv new file mode 100644 index 0000000..746bd3f --- /dev/null +++ b/audio_detection/audio_infer/metadata/black_list/groundtruth_weak_label_testing_set.csv @@ -0,0 +1,606 @@ +-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train horn +-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train horn +-GCwoyCnYsY_0.000_10.000.wav 0.000 10.000 Train horn +-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train horn +-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train horn +-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train horn +-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train horn +-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Train horn +-nGBPqlRNg4_30.000_40.000.wav 30.000 
40.000 Train horn +-u9BxBNcrw4_30.000_40.000.wav 30.000 40.000 Train horn +-zqW9xCZd80_260.000_270.000.wav 260.000 270.000 Train horn +02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train horn +0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train horn +0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train horn +0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train horn +0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train horn +0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train horn +10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train horn +1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train horn +1S5WKCcf-wU_40.000_50.000.wav 40.000 50.000 Train horn +1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train horn +1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train horn +1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train horn +1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train horn +1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train horn +26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train horn +2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train horn +2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train horn +2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train horn +2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train horn +-8baTnilyjs_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +-jG26jT3fP8_230.000_240.000.wav 230.000 240.000 Air horn, truck horn +-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Air horn, truck horn +-v7cUxke-f4_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +-yeWlsEpcpA_15.000_25.000.wav 15.000 25.000 Air horn, truck horn +04KOunVOkSA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +08y2LHhxmsM_400.000_410.000.wav 400.000 410.000 Air horn, truck horn +0G73yqtBwgE_11.000_21.000.wav 11.000 21.000 Air horn, truck horn +0UPY7ws-VFs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn +0euD32aKYUs_10.000_20.000.wav 10.000 20.000 Air horn, truck horn +1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +1iRgwn7p0DA_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +1myTsHAIvYc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +1z0XoG6GEv4_420.000_430.000.wav 420.000 430.000 Air horn, truck horn +26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Air horn, truck horn +2KmSuPb9gwA_24.000_34.000.wav 24.000 34.000 Air horn, truck horn +2Vy5NCEkg2I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +2ZciT0XrifM_0.000_8.000.wav 0.000 8.000 Air horn, truck horn +2jOzX06bzuA_16.000_26.000.wav 16.000 26.000 Air horn, truck horn +35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Air horn, truck horn +3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Air horn, truck horn +3rGOv4evODE_20.000_30.000.wav 20.000 30.000 Air horn, truck horn +42U7xIucU68_20.000_30.000.wav 20.000 30.000 Air horn, truck horn +46r7mO2k6zY_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +4EBnb2DN3Yg_13.000_23.000.wav 13.000 23.000 Air horn, truck horn +4NTjS5pFfSc_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +4bvfOnX7BIE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Air horn, truck horn +-ajCLjpfGKI_83.000_93.000.wav 83.000 93.000 Car alarm +-hLSc9aPOms_13.000_23.000.wav 13.000 23.000 Car alarm +-rgDWfvxxqw_30.000_40.000.wav 30.000 40.000 Car alarm +0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Car alarm +0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car alarm +0ZPafgZftWk_80.000_90.000.wav 80.000 90.000 Car 
alarm +0npLQ4LzD0c_40.000_50.000.wav 40.000 50.000 Car alarm +17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Car alarm +3HxQ83IMyw4_70.000_80.000.wav 70.000 80.000 Car alarm +3z05luLEc_Q_0.000_10.000.wav 0.000 10.000 Car alarm +4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Car alarm +4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car alarm +4h01lBkTVQY_18.000_28.000.wav 18.000 28.000 Car alarm +5-SzZotiaBU_30.000_40.000.wav 30.000 40.000 Car alarm +54PbkldEp9M_30.000_40.000.wav 30.000 40.000 Car alarm +5P6YYsMaIH4_30.000_40.000.wav 30.000 40.000 Car alarm +5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car alarm +7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Car alarm +7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car alarm +7NZ0kMj2HSI_54.000_64.000.wav 54.000 64.000 Car alarm +7RQpt1_1ZzU_30.000_40.000.wav 30.000 40.000 Car alarm +7ee54nr6jG8_30.000_40.000.wav 30.000 40.000 Car alarm +8OajsyPSNt8_40.000_50.000.wav 40.000 50.000 Car alarm +9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car alarm +9fzeD7CeI7Y_110.000_120.000.wav 110.000 120.000 Car alarm +9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car alarm +A-GNszKtjJc_93.000_103.000.wav 93.000 103.000 Car alarm +A437a4Y_xag_230.000_240.000.wav 230.000 240.000 Car alarm +APMPW2YI-Zk_20.000_30.000.wav 20.000 30.000 Car alarm +AR-KmtlXg4Y_70.000_80.000.wav 70.000 80.000 Car alarm +-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Reversing beeps +-6d-zxMvC5E_30.000_40.000.wav 30.000 40.000 Reversing beeps +-6qSMlbJJ58_30.000_40.000.wav 30.000 40.000 Reversing beeps +-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Reversing beeps +-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Reversing beeps +-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Reversing beeps +-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Reversing beeps +-AXDeY-N2_M_30.000_40.000.wav 30.000 40.000 Reversing beeps +-B1uzsLG0Dk_30.000_40.000.wav 30.000 40.000 Reversing beeps +-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Reversing beeps +-Em3OpyaefM_30.000_40.000.wav 30.000 40.000 Reversing beeps +-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Reversing beeps +-SP7KWmTRUU_30.000_40.000.wav 30.000 40.000 Reversing beeps +-h4or05bj_I_30.000_40.000.wav 30.000 40.000 Reversing beeps +-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Reversing beeps +-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Reversing beeps +-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Reversing beeps +-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Reversing beeps +-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Reversing beeps +-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Reversing beeps +-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Reversing beeps +03xMfqt4fZI_24.000_34.000.wav 24.000 34.000 Reversing beeps +0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Reversing beeps +0FQo-2xRJ0E_30.000_40.000.wav 30.000 40.000 Reversing beeps +0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Reversing beeps +0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Reversing beeps +0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Reversing beeps +0P-YGHC5cBU_30.000_40.000.wav 30.000 40.000 Reversing beeps +0QKet-tdquc_30.000_40.000.wav 30.000 40.000 Reversing beeps +0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Reversing beeps +-5px8DVPl8A_28.000_38.000.wav 28.000 38.000 Bicycle +-D08wyQwDPQ_10.000_20.000.wav 10.000 20.000 Bicycle +-F1_Gh78vJ0_30.000_40.000.wav 30.000 40.000 Bicycle +-FZQIkX44Pk_10.000_20.000.wav 10.000 20.000 Bicycle +-FsvS99nWTc_30.000_40.000.wav 30.000 40.000 Bicycle +-Holdef_BZ0_30.000_40.000.wav 30.000 40.000 Bicycle +-Inn26beF70_30.000_40.000.wav 30.000 40.000 Bicycle 
+-Jq9HNSs_ns_14.000_24.000.wav 14.000 24.000 Bicycle +-KlN_AXMM0Q_30.000_40.000.wav 30.000 40.000 Bicycle +-NCcqKWiGus_30.000_40.000.wav 30.000 40.000 Bicycle +-NNC_TqWfGw_30.000_40.000.wav 30.000 40.000 Bicycle +-OGFiXvmldM_30.000_40.000.wav 30.000 40.000 Bicycle +-RFpDUZhN-g_13.000_23.000.wav 13.000 23.000 Bicycle +-XUfeRTw3b4_0.000_6.000.wav 0.000 6.000 Bicycle +-XoATxJ-Qcg_30.000_40.000.wav 30.000 40.000 Bicycle +-bFNxvFwDts_470.000_480.000.wav 470.000 480.000 Bicycle +-e5PokL6Cyo_30.000_40.000.wav 30.000 40.000 Bicycle +-fNyOf9zIU0_30.000_40.000.wav 30.000 40.000 Bicycle +-fhpkRyZL90_30.000_40.000.wav 30.000 40.000 Bicycle +-fo3m0hiZbg_30.000_40.000.wav 30.000 40.000 Bicycle +-ikJkNwcmkA_27.000_37.000.wav 27.000 37.000 Bicycle +-k2nMcxAjWE_30.000_40.000.wav 30.000 40.000 Bicycle +-k80ibA-fyw_30.000_40.000.wav 30.000 40.000 Bicycle +-lBcEVa_NKw_30.000_40.000.wav 30.000 40.000 Bicycle +-mQyAYU_Bd4_50.000_60.000.wav 50.000 60.000 Bicycle +-ngrinYHF4c_30.000_40.000.wav 30.000 40.000 Bicycle +-nqm_RJ2xj8_40.000_50.000.wav 40.000 50.000 Bicycle +-oAw5iTeT1g_40.000_50.000.wav 40.000 50.000 Bicycle +-p2EMzpTE38_4.000_14.000.wav 4.000 14.000 Bicycle +-qmfWP_yzn4_30.000_40.000.wav 30.000 40.000 Bicycle +-0DIFwkUpjQ_50.000_60.000.wav 50.000 60.000 Skateboard +-53qltVyjpc_180.000_190.000.wav 180.000 190.000 Skateboard +-5y4jb9eUWs_110.000_120.000.wav 110.000 120.000 Skateboard +-81kolkG8M0_0.000_8.000.wav 0.000 8.000 Skateboard +-9dwTSq6JZg_70.000_80.000.wav 70.000 80.000 Skateboard +-9oKZsjjf_0_20.000_30.000.wav 20.000 30.000 Skateboard +-AFGfu5zOzQ_30.000_40.000.wav 30.000 40.000 Skateboard +-DHGwygUsQc_30.000_40.000.wav 30.000 40.000 Skateboard +-DkuTmIs7_Q_30.000_40.000.wav 30.000 40.000 Skateboard +-E1E17R7UBA_260.000_270.000.wav 260.000 270.000 Skateboard +-E1aIXhB4YU_30.000_40.000.wav 30.000 40.000 Skateboard +-McJLXNN3-o_50.000_60.000.wav 50.000 60.000 Skateboard +-N7nQ4CXGsY_170.000_180.000.wav 170.000 180.000 Skateboard +-O5vrHFRzcY_30.000_40.000.wav 30.000 40.000 Skateboard +-Plh9jAN_Eo_0.000_2.000.wav 0.000 2.000 Skateboard +-Qd_dXTbgK0_30.000_40.000.wav 30.000 40.000 Skateboard +-aVZ-H92M_s_0.000_4.000.wav 0.000 4.000 Skateboard +-cd-Zn8qFxU_90.000_100.000.wav 90.000 100.000 Skateboard +-esP4loyvjM_60.000_70.000.wav 60.000 70.000 Skateboard +-iB3a71aPew_30.000_40.000.wav 30.000 40.000 Skateboard +-lZapwtvwlg_0.000_10.000.wav 0.000 10.000 Skateboard +-mxMaMJCXL8_180.000_190.000.wav 180.000 190.000 Skateboard +-nYGTw9Sypg_20.000_30.000.wav 20.000 30.000 Skateboard +-oS19KshdlM_30.000_40.000.wav 30.000 40.000 Skateboard +-s6uxc77NWo_40.000_50.000.wav 40.000 50.000 Skateboard +-sCrXS2kJlA_30.000_40.000.wav 30.000 40.000 Skateboard +-saCvPTdQ7s_30.000_40.000.wav 30.000 40.000 Skateboard +-sb-knLiDic_20.000_30.000.wav 20.000 30.000 Skateboard +-tSwRvqaKWg_90.000_100.000.wav 90.000 100.000 Skateboard +-x_jV34hVq4_30.000_40.000.wav 30.000 40.000 Skateboard +--ljM2Kojag_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-4F1TX-T6T4_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-7HVWUwyMig_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-9pUUT-6o8U_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Ambulance (siren) +-LGTb-xyjzA_11.000_21.000.wav 11.000 21.000 Ambulance (siren) +-Y1qiiugnk8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-ZeMV790MXE_10.000_20.000.wav 10.000 20.000 Ambulance (siren) +-d-T8Y9-TOg_17.000_27.000.wav 17.000 27.000 Ambulance (siren) 
+-dcrL5JLmvo_11.000_21.000.wav 11.000 21.000 Ambulance (siren) +-fCSO8SVWZU_6.000_16.000.wav 6.000 16.000 Ambulance (siren) +-fGFQTGd2nA_10.000_20.000.wav 10.000 20.000 Ambulance (siren) +-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Ambulance (siren) +-jnQgpHubNI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-k6p9n9y22Q_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-kr4SUjnm88_29.000_39.000.wav 29.000 39.000 Ambulance (siren) +-lyPnABQhCI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-od8LQAVgno_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-pVEgzu95Nc_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-w-9yF465IY_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-woquFRnQk8_16.000_26.000.wav 16.000 26.000 Ambulance (siren) +-xz75wUCln8_50.000_60.000.wav 50.000 60.000 Ambulance (siren) +-yGElLHdkEI_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-yPSgCn9AWo_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +02u3P99INjs_8.000_18.000.wav 8.000 18.000 Ambulance (siren) +06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Ambulance (siren) +0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +-0Eem_FuIto_15.000_25.000.wav 15.000 25.000 Fire engine, fire truck (siren) +-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-4B435WQvag_20.000_30.000.wav 20.000 30.000 Fire engine, fire truck (siren) +-6qhtwdfGOA_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren) +-8uyNBFbdFc_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Fire engine, fire truck (siren) +-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Fire engine, fire truck (siren) +-QBo1W2w8II_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-QX-ddNtUvE_24.000_34.000.wav 24.000 34.000 Fire engine, fire truck (siren) +-RlUu1el2G4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-SkO97C81Ms_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-T8QHPXfIC4_13.000_23.000.wav 13.000 23.000 Fire engine, fire truck (siren) +-USiTjZoh88_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-Z3ByS_RCwI_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-cOjJ0Nvtlw_23.000_33.000.wav 23.000 33.000 Fire engine, fire truck (siren) +-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Fire engine, fire truck (siren) +-eYUCWGQ_wU_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Fire engine, fire truck (siren) +-hplTh4SGvs_90.000_100.000.wav 90.000 100.000 Fire engine, fire truck (siren) +-nPhg6Eu4b4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-oEGuMg8hT4_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-pvaJ4DwtRg_3.000_13.000.wav 3.000 13.000 Fire engine, fire truck (siren) +-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-sJn3uUxpH8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +-sfn1NDHWJI_30.000_40.000.wav 30.000 40.000 Fire engine, fire 
truck (siren) +-09rxiqNNEs_30.000_40.000.wav 30.000 40.000 Civil defense siren +-3qh-WFUV2U_30.000_40.000.wav 30.000 40.000 Civil defense siren +-4JG_Ag99hY_30.000_40.000.wav 30.000 40.000 Civil defense siren +-60NmEaP0is_0.000_10.000.wav 0.000 10.000 Civil defense siren +-6cTEqIcics_30.000_40.000.wav 30.000 40.000 Civil defense siren +-6iVBmb5PZU_40.000_50.000.wav 40.000 50.000 Civil defense siren +-6qp8NjWffE_30.000_40.000.wav 30.000 40.000 Civil defense siren +-75iY1j3MeY_30.000_40.000.wav 30.000 40.000 Civil defense siren +-E3Yju3lrRo_30.000_40.000.wav 30.000 40.000 Civil defense siren +-FHSBdx5A3g_40.000_50.000.wav 40.000 50.000 Civil defense siren +-JhSzxTdcwY_30.000_40.000.wav 30.000 40.000 Civil defense siren +-OtNDK_Hxp8_30.000_40.000.wav 30.000 40.000 Civil defense siren +-S3_I0RiG3g_30.000_40.000.wav 30.000 40.000 Civil defense siren +-YMXgDKKAwU_30.000_40.000.wav 30.000 40.000 Civil defense siren +-c7XoYM-SSY_30.000_40.000.wav 30.000 40.000 Civil defense siren +-j8EeIX9ynk_30.000_40.000.wav 30.000 40.000 Civil defense siren +-t478yabOQw_30.000_40.000.wav 30.000 40.000 Civil defense siren +-uIyMR9luvg_30.000_40.000.wav 30.000 40.000 Civil defense siren +-wgP6ua-t4k_40.000_50.000.wav 40.000 50.000 Civil defense siren +-zGAb18JxmI_30.000_40.000.wav 30.000 40.000 Civil defense siren +03NLMEMi8-I_30.000_40.000.wav 30.000 40.000 Civil defense siren +0552YhBdeXo_30.000_40.000.wav 30.000 40.000 Civil defense siren +06TM6z3NvuY_30.000_40.000.wav 30.000 40.000 Civil defense siren +0CUi0oGUzjU_30.000_40.000.wav 30.000 40.000 Civil defense siren +0GpUFFJNFH8_30.000_40.000.wav 30.000 40.000 Civil defense siren +0H_WUo2srs0_30.000_40.000.wav 30.000 40.000 Civil defense siren +0HvYkBXQ44A_30.000_40.000.wav 30.000 40.000 Civil defense siren +0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Civil defense siren +0JKcTVpby0I_30.000_40.000.wav 30.000 40.000 Civil defense siren +0PhU-PIsUMw_40.000_50.000.wav 40.000 50.000 Civil defense siren +-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Police car (siren) +-1U98XBTyB4_30.000_40.000.wav 30.000 40.000 Police car (siren) +-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Police car (siren) +-6WqJCSmkCw_70.000_80.000.wav 70.000 80.000 Police car (siren) +-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Police car (siren) +-AFASmp1fpk_6.000_16.000.wav 6.000 16.000 Police car (siren) +-F2lk9A8B8M_30.000_40.000.wav 30.000 40.000 Police car (siren) +-GPv09qi9A8_120.000_130.000.wav 120.000 130.000 Police car (siren) +-Hi-WpRGUpc_9.000_19.000.wav 9.000 19.000 Police car (siren) +-KsPTvgJJVE_350.000_360.000.wav 350.000 360.000 Police car (siren) +-MfBpxtGQmE_20.000_30.000.wav 20.000 30.000 Police car (siren) +-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Police car (siren) +-UCf_-3yzWU_290.000_300.000.wav 290.000 300.000 Police car (siren) +-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Police car (siren) +-XRiLbb3Syo_2.000_12.000.wav 2.000 12.000 Police car (siren) +-XrpzGb6xCU_190.000_200.000.wav 190.000 200.000 Police car (siren) +-YsrLG2K1TE_30.000_40.000.wav 30.000 40.000 Police car (siren) +-ZtZOcg3s7M_30.000_40.000.wav 30.000 40.000 Police car (siren) +-_8fdnv6Crg_30.000_40.000.wav 30.000 40.000 Police car (siren) +-az6BooRLxw_40.000_50.000.wav 40.000 50.000 Police car (siren) +-bs3c27rEtc_30.000_40.000.wav 30.000 40.000 Police car (siren) +-dBTGdL4RFs_30.000_40.000.wav 30.000 40.000 Police car (siren) +-gKNRXbpAKs_30.000_40.000.wav 30.000 40.000 Police car (siren) +-hA1yMrEXz0_10.000_20.000.wav 10.000 20.000 Police car (siren) +-haSUR_IUto_30.000_40.000.wav 30.000 
40.000 Police car (siren) +-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Police car (siren) +-lWs7_49gss_30.000_40.000.wav 30.000 40.000 Police car (siren) +-lhnhB4rbGw_3.000_13.000.wav 3.000 13.000 Police car (siren) +-rkJeBBmiTQ_60.000_70.000.wav 60.000 70.000 Police car (siren) +-rs7FPxzc6w_8.000_18.000.wav 8.000 18.000 Police car (siren) +-20uudT97E0_30.000_40.000.wav 30.000 40.000 Screaming +-3bGlOhRkAo_140.000_150.000.wav 140.000 150.000 Screaming +-4pUrlMafww_1.000_11.000.wav 1.000 11.000 Screaming +-7R0ybQQAHg_60.000_70.000.wav 60.000 70.000 Screaming +-7gojlG6bE4_30.000_40.000.wav 30.000 40.000 Screaming +-GI5PbO6j50_30.000_40.000.wav 30.000 40.000 Screaming +-MuIRudOtxw_30.000_40.000.wav 30.000 40.000 Screaming +-WfQBr42ymw_30.000_40.000.wav 30.000 40.000 Screaming +-YOjIgYspsY_30.000_40.000.wav 30.000 40.000 Screaming +-g_AcRVFfXU_30.000_40.000.wav 30.000 40.000 Screaming +-gb5uvwsRpI_30.000_40.000.wav 30.000 40.000 Screaming +-iAwqlQ3TEk_0.000_3.000.wav 0.000 3.000 Screaming +-nJoxcmxz5g_30.000_40.000.wav 30.000 40.000 Screaming +-pwgypWE-J8_30.000_40.000.wav 30.000 40.000 Screaming +-pzasCR0kpc_30.000_40.000.wav 30.000 40.000 Screaming +-sUgHKZQKYc_30.000_40.000.wav 30.000 40.000 Screaming +-uazzQEmQ7c_0.000_10.000.wav 0.000 10.000 Screaming +-vHJU1wDRsY_30.000_40.000.wav 30.000 40.000 Screaming +0-RnTXpp8Q0_30.000_40.000.wav 30.000 40.000 Screaming +09YQukdYVI4_30.000_40.000.wav 30.000 40.000 Screaming +0Ees8KFCUXM_30.000_40.000.wav 30.000 40.000 Screaming +0EymGuYWkFk_30.000_40.000.wav 30.000 40.000 Screaming +0Nw1OyTsaAo_30.000_40.000.wav 30.000 40.000 Screaming +0YnOMAls83g_30.000_40.000.wav 30.000 40.000 Screaming +0_gyUQkLCY8_30.000_40.000.wav 30.000 40.000 Screaming +0_hnDV2SHBI_7.000_17.000.wav 7.000 17.000 Screaming +0cqEaAkbrbI_80.000_90.000.wav 80.000 90.000 Screaming +0hC044mDsWA_30.000_40.000.wav 30.000 40.000 Screaming +0kQANiakiH0_30.000_40.000.wav 30.000 40.000 Screaming +0rVBXpbgO8s_30.000_40.000.wav 30.000 40.000 Screaming +---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car +--330hg-Ocw_30.000_40.000.wav 30.000 40.000 Car +--8puiAGLhs_30.000_40.000.wav 30.000 40.000 Car +--9VR_F7CtY_30.000_40.000.wav 30.000 40.000 Car +--F70LWypIg_30.000_40.000.wav 30.000 40.000 Car +--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car +--QvRbvnbUE_30.000_40.000.wav 30.000 40.000 Car +--SeOZy3Yik_30.000_40.000.wav 30.000 40.000 Car +--Zz7BgxSUg_30.000_40.000.wav 30.000 40.000 Car +--e0Vu_ruTc_30.000_40.000.wav 30.000 40.000 Car +--iFD6IyQW8_30.000_40.000.wav 30.000 40.000 Car +--jGnLqFsQ4_24.000_34.000.wav 24.000 34.000 Car +--jc0NAxK8M_30.000_40.000.wav 30.000 40.000 Car +--v1WjOJv-w_150.000_160.000.wav 150.000 160.000 Car +--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car +--yaQA8d1dI_6.000_16.000.wav 6.000 16.000 Car +--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car +-0-jXXldDOU_10.000_20.000.wav 10.000 20.000 Car +-03ld83JliM_29.000_39.000.wav 29.000 39.000 Car +-0B-egfXU7E_30.000_40.000.wav 30.000 40.000 Car +-0Bkyt8iZ1I_8.000_18.000.wav 8.000 18.000 Car +-0CIk-OOp7Y_30.000_40.000.wav 30.000 40.000 Car +-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car +-0CY5NWBHyY_20.000_30.000.wav 20.000 30.000 Car +-0HsrVfb5vc_20.000_30.000.wav 20.000 30.000 Car +-0I89-H0AFo_26.000_36.000.wav 26.000 36.000 Car +-0P6VDQ1YDs_80.000_90.000.wav 80.000 90.000 Car +-0PrEsytvc0_30.000_40.000.wav 30.000 40.000 Car +-0RqnaXZu_E_30.000_40.000.wav 30.000 40.000 Car +-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car +---lTs1dxhU_30.000_40.000.wav 30.000 40.000 Car passing by +--P4wuph3Mc_0.000_8.000.wav 0.000 8.000 Car 
passing by +--xDffQ9Mwo_30.000_40.000.wav 30.000 40.000 Car passing by +--zLzL0sq3M_30.000_40.000.wav 30.000 40.000 Car passing by +--zbPxnl27o_20.000_30.000.wav 20.000 30.000 Car passing by +-0CRb8H4hzY_4.000_14.000.wav 4.000 14.000 Car passing by +-0MnD7jBvkE_0.000_4.000.wav 0.000 4.000 Car passing by +-0U3c4PN8sc_30.000_40.000.wav 30.000 40.000 Car passing by +-0Yynyhm1AY_14.000_24.000.wav 14.000 24.000 Car passing by +-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car passing by +-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car passing by +-15nPYi2v1g_30.000_40.000.wav 30.000 40.000 Car passing by +-19pq3HJoBM_30.000_40.000.wav 30.000 40.000 Car passing by +-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car passing by +-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car passing by +-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car passing by +-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car passing by +-2-luek6dI8_30.000_40.000.wav 30.000 40.000 Car passing by +-21-RfxQscI_30.000_40.000.wav 30.000 40.000 Car passing by +-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car passing by +-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car passing by +-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car passing by +-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car passing by +-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car passing by +-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car passing by +-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car passing by +-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car passing by +-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car passing by +-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car passing by +-3exNVlj92w_30.000_40.000.wav 30.000 40.000 Car passing by +--0w1YA1Hm4_30.000_40.000.wav 30.000 40.000 Bus +-0_vEaaXndY_11.000_21.000.wav 11.000 21.000 Bus +-5GcZwBvBdI_30.000_40.000.wav 30.000 40.000 Bus +-5digoPWn6U_8.000_18.000.wav 8.000 18.000 Bus +-79l4w4DsYM_30.000_40.000.wav 30.000 40.000 Bus +-7B4pbkIEas_30.000_40.000.wav 30.000 40.000 Bus +-8YTu7ZGA2w_30.000_40.000.wav 30.000 40.000 Bus +-93IM29_8rs_14.000_24.000.wav 14.000 24.000 Bus +-9GhPxGkpio_26.000_36.000.wav 26.000 36.000 Bus +-9J9xs7LM9Y_25.000_35.000.wav 25.000 35.000 Bus +-AY_lZLYJR8_8.000_18.000.wav 8.000 18.000 Bus +-AdQBgtN_4E_30.000_40.000.wav 30.000 40.000 Bus +-BxfsWlPUPY_30.000_40.000.wav 30.000 40.000 Bus +-CgCr8Eknm0_14.000_24.000.wav 14.000 24.000 Bus +-CnsvTDIXdE_20.000_30.000.wav 20.000 30.000 Bus +-CpMlnGhxEU_0.000_9.000.wav 0.000 9.000 Bus +-DP_cv0x_Ng_30.000_40.000.wav 30.000 40.000 Bus +-FEXRjcryZE_30.000_40.000.wav 30.000 40.000 Bus +-Fp2-w-iLiE_20.000_30.000.wav 20.000 30.000 Bus +-GLk6G9U09A_30.000_40.000.wav 30.000 40.000 Bus +-Ga9sSkpngg_30.000_40.000.wav 30.000 40.000 Bus +-H8V23dZoLo_0.000_10.000.wav 0.000 10.000 Bus +-HeQfwKbFzg_30.000_40.000.wav 30.000 40.000 Bus +-HzzEuFBiDU_30.000_40.000.wav 30.000 40.000 Bus +-I4INTpMKT4_30.000_40.000.wav 30.000 40.000 Bus +-II-7qJxKPc_21.000_31.000.wav 21.000 31.000 Bus +-LnpzyfTkF8_30.000_40.000.wav 30.000 40.000 Bus +-OgRshQfsi8_30.000_40.000.wav 30.000 40.000 Bus +-P53lJ1ViWk_30.000_40.000.wav 30.000 40.000 Bus +-PvNUvEov4Q_30.000_40.000.wav 30.000 40.000 Bus +--12UOziMF0_30.000_40.000.wav 30.000 40.000 Truck +--73E04RpiQ_0.000_9.000.wav 0.000 9.000 Truck +--J947HxQVM_0.000_9.000.wav 0.000 9.000 Truck +--bD1DVKlzQ_30.000_40.000.wav 30.000 40.000 Truck +--ivFZu-hlc_30.000_40.000.wav 30.000 40.000 Truck +--wuU7kzB5o_30.000_40.000.wav 30.000 40.000 Truck +-0B_CYyG5Dg_30.000_40.000.wav 30.000 40.000 Truck +-0JqTq_4jaE_40.000_50.000.wav 40.000 50.000 Truck 
+-0MrEZKJ5MQ_30.000_40.000.wav 30.000 40.000 Truck +-0awng26xQ8_30.000_40.000.wav 30.000 40.000 Truck +-0dq1Vg9rd8_30.000_40.000.wav 30.000 40.000 Truck +-0wkq7CUYME_310.000_320.000.wav 310.000 320.000 Truck +-14RXdkqYuI_30.000_40.000.wav 30.000 40.000 Truck +-1B3CzpiW1M_30.000_40.000.wav 30.000 40.000 Truck +-1Q21cZhHDE_30.000_40.000.wav 30.000 40.000 Truck +-1ZXXnBXJ6c_8.000_18.000.wav 8.000 18.000 Truck +-1s0DWApvT8_30.000_40.000.wav 30.000 40.000 Truck +-1s84_2Vn4g_30.000_40.000.wav 30.000 40.000 Truck +-26ansJluVo_30.000_40.000.wav 30.000 40.000 Truck +-2EscdO0l-A_30.000_40.000.wav 30.000 40.000 Truck +-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Truck +-2NBZUCcvm0_30.000_40.000.wav 30.000 40.000 Truck +-2sT5oBBWWY_30.000_40.000.wav 30.000 40.000 Truck +-2vmprMUw10_30.000_40.000.wav 30.000 40.000 Truck +-2x4TB8VWvE_18.000_28.000.wav 18.000 28.000 Truck +-39q4y0tt-g_30.000_40.000.wav 30.000 40.000 Truck +-3N5rjPrNCc_190.000_200.000.wav 190.000 200.000 Truck +-3NcUIyJtFY_30.000_40.000.wav 30.000 40.000 Truck +-3PplV0ErOk_30.000_40.000.wav 30.000 40.000 Truck +-3gSkrDKNSA_27.000_37.000.wav 27.000 37.000 Truck +--p-rk_HBuU_30.000_40.000.wav 30.000 40.000 Motorcycle +-1WK72M4xeg_220.000_230.000.wav 220.000 230.000 Motorcycle +-1XfuJcdvfg_30.000_40.000.wav 30.000 40.000 Motorcycle +-3XWBAmjmaQ_11.000_21.000.wav 11.000 21.000 Motorcycle +-4-87UgJcUw_70.000_80.000.wav 70.000 80.000 Motorcycle +-4D3Gkyisyc_30.000_40.000.wav 30.000 40.000 Motorcycle +-5k5GyHd2So_4.000_14.000.wav 4.000 14.000 Motorcycle +-6A2L1U9b5Y_54.000_64.000.wav 54.000 64.000 Motorcycle +-6Yfati1N10_80.000_90.000.wav 80.000 90.000 Motorcycle +-7_o_GhpZpM_12.000_22.000.wav 12.000 22.000 Motorcycle +-7rZwMK6uSs_70.000_80.000.wav 70.000 80.000 Motorcycle +-85f5DKKfSo_30.000_40.000.wav 30.000 40.000 Motorcycle +-9Smdrt5zwk_40.000_50.000.wav 40.000 50.000 Motorcycle +-9gZLVDKpnE_30.000_40.000.wav 30.000 40.000 Motorcycle +-BGebo8V4XY_30.000_40.000.wav 30.000 40.000 Motorcycle +-DdiduB5B_w_190.000_200.000.wav 190.000 200.000 Motorcycle +-HIPq7T3eFI_11.000_21.000.wav 11.000 21.000 Motorcycle +-H_3oEkKe0M_50.000_60.000.wav 50.000 60.000 Motorcycle +-HmuMoykRqA_500.000_510.000.wav 500.000 510.000 Motorcycle +-IMRE_psvtI_30.000_40.000.wav 30.000 40.000 Motorcycle +-Ie4LSPDEF4_6.000_16.000.wav 6.000 16.000 Motorcycle +-J0F29UCZiA_70.000_80.000.wav 70.000 80.000 Motorcycle +-KFCJ7ydu2E_0.000_10.000.wav 0.000 10.000 Motorcycle +-KmDAgYb0Uo_100.000_110.000.wav 100.000 110.000 Motorcycle +-P7iW3WzNfc_400.000_410.000.wav 400.000 410.000 Motorcycle +-QMAKXzIGx4_10.000_20.000.wav 10.000 20.000 Motorcycle +-S-5z2vYtxw_10.000_20.000.wav 10.000 20.000 Motorcycle +-SlL0NZh51w_30.000_40.000.wav 30.000 40.000 Motorcycle +-US2mpJxbj4_30.000_40.000.wav 30.000 40.000 Motorcycle +-VO-C9C0uqY_1.000_11.000.wav 1.000 11.000 Motorcycle +--H_-CEB2wA_30.000_40.000.wav 30.000 40.000 Train +-1VsFy0eVJs_30.000_40.000.wav 30.000 40.000 Train +-1X7kpLnOpM_60.000_70.000.wav 60.000 70.000 Train +-3FIglJti0s_30.000_40.000.wav 30.000 40.000 Train +-5QrBL6MzLg_60.000_70.000.wav 60.000 70.000 Train +-6KOEEiAf9s_19.000_29.000.wav 19.000 29.000 Train +-97l_c6PToE_30.000_40.000.wav 30.000 40.000 Train +-9S5Z-uciLo_70.000_80.000.wav 70.000 80.000 Train +-CkgGfKepO4_140.000_150.000.wav 140.000 150.000 Train +-E0shPRxAbo_30.000_40.000.wav 30.000 40.000 Train +-Gbohom8C4Q_30.000_40.000.wav 30.000 40.000 Train +-JpQivta6MQ_20.000_30.000.wav 20.000 30.000 Train +-K9oTZj3mVQ_30.000_40.000.wav 30.000 40.000 Train +-KjE40DlSdU_0.000_10.000.wav 0.000 10.000 Train 
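Each entry in the strong-label lists above follows the pattern "<clip>.wav <onset> <offset> <event label>", where the label itself may contain spaces ("Police car (siren)", "Car passing by"). A minimal parsing sketch, assuming whitespace-separated fields (the exact delimiter is not shown in this diff):

def parse_strong_labels(path):
    # Reads lines like "-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Police car (siren)"
    entries = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 4:
                continue
            filename = parts[0]
            onset, offset = float(parts[1]), float(parts[2])
            label = ' '.join(parts[3:])   # labels such as "Car passing by" contain spaces
            entries.append((filename, onset, offset, label))
    return entries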
+-NrFtZ_xxFU_30.000_40.000.wav 30.000 40.000 Train +-PYRamK58Ss_0.000_10.000.wav 0.000 10.000 Train +-P_XDJt4p_s_30.000_40.000.wav 30.000 40.000 Train +-Pjylzex7oc_350.000_360.000.wav 350.000 360.000 Train +-QHuZGmIy_I_30.000_40.000.wav 30.000 40.000 Train +-Qfk_Q2ctBs_30.000_40.000.wav 30.000 40.000 Train +-RXKRoRPWXg_30.000_40.000.wav 30.000 40.000 Train +-VH414svzI0_30.000_40.000.wav 30.000 40.000 Train +-WFdYxE-PYI_30.000_40.000.wav 30.000 40.000 Train +-Wd1pV7UjWg_60.000_70.000.wav 60.000 70.000 Train +-XcC-UlbcRA_30.000_40.000.wav 30.000 40.000 Train +-Y2cD8xvCHI_30.000_40.000.wav 30.000 40.000 Train +-ZKZkMHe3cY_70.000_80.000.wav 70.000 80.000 Train +-Zq22n4OewA_30.000_40.000.wav 30.000 40.000 Train +-aZ7XC4LG2A_30.000_40.000.wav 30.000 40.000 Train +-abVemAm9HM_430.000_440.000.wav 430.000 440.000 Train +1T1i2rny8RU_30.000_40.000.wav 30.000 40.000 Ambulance (siren) +7DC3HtNi4fU_160.000_170.000.wav 160.000 170.000 Ambulance (siren) +-z8jsgl3iHE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +00H_s-krtg8_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Fire engine, fire truck (siren) +4l78f9VZ9uE_30.000_40.000.wav 30.000 40.000 Fire engine, fire truck (siren) +35EOmSMTQ6I_30.000_40.000.wav 30.000 40.000 Civil defense siren +06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Police car (siren) +0EPK7Pv_lbE_30.000_40.000.wav 30.000 40.000 Police car (siren) +0I6Mlp27_gM_30.000_40.000.wav 30.000 40.000 Police car (siren) +17VuPl9Wxvs_20.000_30.000.wav 20.000 30.000 Police car (siren) +4A1Ar1TIXIY_30.000_40.000.wav 30.000 40.000 Police car (siren) +-10fWp7Pqs4_30.000_40.000.wav 30.000 40.000 Car +-122tCXtFhU_30.000_40.000.wav 30.000 40.000 Car +-14BFlDzjS4_6.000_16.000.wav 6.000 16.000 Car +-1BrkFLHD74_19.000_29.000.wav 19.000 29.000 Car +-1HlfoHZCEE_6.000_16.000.wav 6.000 16.000 Car +-1McjOPUzbo_30.000_40.000.wav 30.000 40.000 Car +-1sGSNmgiPs_4.000_14.000.wav 4.000 14.000 Car +-25LkbSjEos_30.000_40.000.wav 30.000 40.000 Car +-2GlU3e0nTU_170.000_180.000.wav 170.000 180.000 Car +-2LJWaL2PuA_30.000_40.000.wav 30.000 40.000 Car +-2ZbvsBSZmY_2.000_12.000.wav 2.000 12.000 Car +-2cz2qQDmr4_30.000_40.000.wav 30.000 40.000 Car +-31KUAOSg5U_5.000_15.000.wav 5.000 15.000 Car +-35qBdzN9ck_30.000_40.000.wav 30.000 40.000 Car +-3929cmVE20_30.000_40.000.wav 30.000 40.000 Car +-3M-k4nIYIM_30.000_40.000.wav 30.000 40.000 Car +-3MNphBfq_0_30.000_40.000.wav 30.000 40.000 Car +-3_RSVYKkkk_30.000_40.000.wav 30.000 40.000 Car +-AF7wp3ezww_140.000_150.000.wav 140.000 150.000 Car +-Pg4vVPs4bE_30.000_40.000.wav 30.000 40.000 Car +-VULyMtKazE_0.000_7.000.wav 0.000 7.000 Car +-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Car +06RreMb5qbE_0.000_10.000.wav 0.000 10.000 Car +0E4AqW9dmdk_30.000_40.000.wav 30.000 40.000 Car +0Hz4R_m0hmI_80.000_90.000.wav 80.000 90.000 Car +4Kpklmj-ze0_53.000_63.000.wav 53.000 63.000 Car +5tzTahLHylw_70.000_80.000.wav 70.000 80.000 Car +7NJ5TbNEIvA_250.000_260.000.wav 250.000 260.000 Car +9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car +9jYv9WuyknA_130.000_140.000.wav 130.000 140.000 Car +-l-DEfDAvNA_30.000_40.000.wav 30.000 40.000 Car passing by +9fCibkUT_gQ_30.000_40.000.wav 30.000 40.000 Car passing by +-jj2tyuf6-A_80.000_90.000.wav 80.000 90.000 Bus +-45cKZA7Jww_30.000_40.000.wav 30.000 40.000 Truck +-4B435WQvag_20.000_30.000.wav 20.000 30.000 Truck +-60XojQWWoc_30.000_40.000.wav 30.000 40.000 Truck +-6qhtwdfGOA_23.000_33.000.wav 23.000 
33.000 Truck +-8OITuFZha8_30.000_40.000.wav 30.000 40.000 Truck +-8n2NqDFRko_30.000_40.000.wav 30.000 40.000 Truck +-AIrHVeCgtM_30.000_40.000.wav 30.000 40.000 Truck +-AVzYvKHwPg_30.000_40.000.wav 30.000 40.000 Truck +-BM_EAszxBg_30.000_40.000.wav 30.000 40.000 Truck +-Ei2LE71Dfg_20.000_30.000.wav 20.000 30.000 Truck +-FWkB2IDMhc_30.000_40.000.wav 30.000 40.000 Truck +-Jsu4dbuO4A_30.000_40.000.wav 30.000 40.000 Truck +-PRrNx6_MD0_16.000_26.000.wav 16.000 26.000 Truck +-X0vNLwH1C0_30.000_40.000.wav 30.000 40.000 Truck +-cbYvBBXE6A_12.000_22.000.wav 12.000 22.000 Truck +-oCvKmNbhl0_30.000_40.000.wav 30.000 40.000 Truck +-oV6dQu5tZo_30.000_40.000.wav 30.000 40.000 Truck +-qKRKDTbt4c_30.000_40.000.wav 30.000 40.000 Truck +-r8mfjRiHrU_30.000_40.000.wav 30.000 40.000 Truck +-s9kwrRilOY_30.000_40.000.wav 30.000 40.000 Truck +-uMiGr6xvRA_30.000_40.000.wav 30.000 40.000 Truck +-x70B12Mb-8_30.000_40.000.wav 30.000 40.000 Truck +-xYsfYZOI-Y_30.000_40.000.wav 30.000 40.000 Truck +-zxrdL6MlKI_30.000_40.000.wav 30.000 40.000 Truck +0C3kqtF76t8_50.000_60.000.wav 50.000 60.000 Truck +0HmiH-wKLB4_30.000_40.000.wav 30.000 40.000 Truck +0KskqFt3DoY_15.000_25.000.wav 15.000 25.000 Truck +0OiPtV9sd_w_30.000_40.000.wav 30.000 40.000 Truck +0VnoYVqd-yo_30.000_40.000.wav 30.000 40.000 Truck +3YaLkgUMhAA_110.000_120.000.wav 110.000 120.000 Truck +-nGBPqlRNg4_30.000_40.000.wav 30.000 40.000 Train +02w3vd_GgF0_390.000_400.000.wav 390.000 400.000 Train +0HqeYIREv8M_30.000_40.000.wav 30.000 40.000 Train +0IpYF91Fdt0_80.000_90.000.wav 80.000 90.000 Train +0NaZejdABG0_90.000_100.000.wav 90.000 100.000 Train +0RurXUfKyow_4.000_14.000.wav 4.000 14.000 Train +0_HnD-rW3lI_170.000_180.000.wav 170.000 180.000 Train +10i60V1RZkQ_210.000_220.000.wav 210.000 220.000 Train +1FJY5X1iY9I_170.000_180.000.wav 170.000 180.000 Train +1U0Ty6CW6AM_40.000_50.000.wav 40.000 50.000 Train +1hQLr88iCvg_30.000_40.000.wav 30.000 40.000 Train +1iUXERALOOs_190.000_200.000.wav 190.000 200.000 Train +1iWFlLpixKU_5.000_15.000.wav 5.000 15.000 Train +1oJAVJPX0YY_20.000_30.000.wav 20.000 30.000 Train +26dNsDuIt9Q_340.000_350.000.wav 340.000 350.000 Train +2BMHsKLcb7E_90.000_100.000.wav 90.000 100.000 Train +2RpOd9MJjyQ_10.000_20.000.wav 10.000 20.000 Train +2U4wSdl10to_200.000_210.000.wav 200.000 210.000 Train +2aBV6AZt5nk_570.000_580.000.wav 570.000 580.000 Train +3ntFslTK6hM_90.000_100.000.wav 90.000 100.000 Train diff --git a/audio_detection/audio_infer/metadata/class_labels_indices.csv b/audio_detection/audio_infer/metadata/class_labels_indices.csv new file mode 100644 index 0000000..3a2767e --- /dev/null +++ b/audio_detection/audio_infer/metadata/class_labels_indices.csv @@ -0,0 +1,528 @@ +index,mid,display_name +0,/m/09x0r,"Speech" +1,/m/05zppz,"Male speech, man speaking" +2,/m/02zsn,"Female speech, woman speaking" +3,/m/0ytgt,"Child speech, kid speaking" +4,/m/01h8n0,"Conversation" +5,/m/02qldy,"Narration, monologue" +6,/m/0261r1,"Babbling" +7,/m/0brhx,"Speech synthesizer" +8,/m/07p6fty,"Shout" +9,/m/07q4ntr,"Bellow" +10,/m/07rwj3x,"Whoop" +11,/m/07sr1lc,"Yell" +12,/m/04gy_2,"Battle cry" +13,/t/dd00135,"Children shouting" +14,/m/03qc9zr,"Screaming" +15,/m/02rtxlg,"Whispering" +16,/m/01j3sz,"Laughter" +17,/t/dd00001,"Baby laughter" +18,/m/07r660_,"Giggle" +19,/m/07s04w4,"Snicker" +20,/m/07sq110,"Belly laugh" +21,/m/07rgt08,"Chuckle, chortle" +22,/m/0463cq4,"Crying, sobbing" +23,/t/dd00002,"Baby cry, infant cry" +24,/m/07qz6j3,"Whimper" +25,/m/07qw_06,"Wail, moan" +26,/m/07plz5l,"Sigh" +27,/m/015lz1,"Singing" +28,/m/0l14jd,"Choir" 
+29,/m/01swy6,"Yodeling" +30,/m/02bk07,"Chant" +31,/m/01c194,"Mantra" +32,/t/dd00003,"Male singing" +33,/t/dd00004,"Female singing" +34,/t/dd00005,"Child singing" +35,/t/dd00006,"Synthetic singing" +36,/m/06bxc,"Rapping" +37,/m/02fxyj,"Humming" +38,/m/07s2xch,"Groan" +39,/m/07r4k75,"Grunt" +40,/m/01w250,"Whistling" +41,/m/0lyf6,"Breathing" +42,/m/07mzm6,"Wheeze" +43,/m/01d3sd,"Snoring" +44,/m/07s0dtb,"Gasp" +45,/m/07pyy8b,"Pant" +46,/m/07q0yl5,"Snort" +47,/m/01b_21,"Cough" +48,/m/0dl9sf8,"Throat clearing" +49,/m/01hsr_,"Sneeze" +50,/m/07ppn3j,"Sniff" +51,/m/06h7j,"Run" +52,/m/07qv_x_,"Shuffle" +53,/m/07pbtc8,"Walk, footsteps" +54,/m/03cczk,"Chewing, mastication" +55,/m/07pdhp0,"Biting" +56,/m/0939n_,"Gargling" +57,/m/01g90h,"Stomach rumble" +58,/m/03q5_w,"Burping, eructation" +59,/m/02p3nc,"Hiccup" +60,/m/02_nn,"Fart" +61,/m/0k65p,"Hands" +62,/m/025_jnm,"Finger snapping" +63,/m/0l15bq,"Clapping" +64,/m/01jg02,"Heart sounds, heartbeat" +65,/m/01jg1z,"Heart murmur" +66,/m/053hz1,"Cheering" +67,/m/028ght,"Applause" +68,/m/07rkbfh,"Chatter" +69,/m/03qtwd,"Crowd" +70,/m/07qfr4h,"Hubbub, speech noise, speech babble" +71,/t/dd00013,"Children playing" +72,/m/0jbk,"Animal" +73,/m/068hy,"Domestic animals, pets" +74,/m/0bt9lr,"Dog" +75,/m/05tny_,"Bark" +76,/m/07r_k2n,"Yip" +77,/m/07qf0zm,"Howl" +78,/m/07rc7d9,"Bow-wow" +79,/m/0ghcn6,"Growling" +80,/t/dd00136,"Whimper (dog)" +81,/m/01yrx,"Cat" +82,/m/02yds9,"Purr" +83,/m/07qrkrw,"Meow" +84,/m/07rjwbb,"Hiss" +85,/m/07r81j2,"Caterwaul" +86,/m/0ch8v,"Livestock, farm animals, working animals" +87,/m/03k3r,"Horse" +88,/m/07rv9rh,"Clip-clop" +89,/m/07q5rw0,"Neigh, whinny" +90,/m/01xq0k1,"Cattle, bovinae" +91,/m/07rpkh9,"Moo" +92,/m/0239kh,"Cowbell" +93,/m/068zj,"Pig" +94,/t/dd00018,"Oink" +95,/m/03fwl,"Goat" +96,/m/07q0h5t,"Bleat" +97,/m/07bgp,"Sheep" +98,/m/025rv6n,"Fowl" +99,/m/09b5t,"Chicken, rooster" +100,/m/07st89h,"Cluck" +101,/m/07qn5dc,"Crowing, cock-a-doodle-doo" +102,/m/01rd7k,"Turkey" +103,/m/07svc2k,"Gobble" +104,/m/09ddx,"Duck" +105,/m/07qdb04,"Quack" +106,/m/0dbvp,"Goose" +107,/m/07qwf61,"Honk" +108,/m/01280g,"Wild animals" +109,/m/0cdnk,"Roaring cats (lions, tigers)" +110,/m/04cvmfc,"Roar" +111,/m/015p6,"Bird" +112,/m/020bb7,"Bird vocalization, bird call, bird song" +113,/m/07pggtn,"Chirp, tweet" +114,/m/07sx8x_,"Squawk" +115,/m/0h0rv,"Pigeon, dove" +116,/m/07r_25d,"Coo" +117,/m/04s8yn,"Crow" +118,/m/07r5c2p,"Caw" +119,/m/09d5_,"Owl" +120,/m/07r_80w,"Hoot" +121,/m/05_wcq,"Bird flight, flapping wings" +122,/m/01z5f,"Canidae, dogs, wolves" +123,/m/06hps,"Rodents, rats, mice" +124,/m/04rmv,"Mouse" +125,/m/07r4gkf,"Patter" +126,/m/03vt0,"Insect" +127,/m/09xqv,"Cricket" +128,/m/09f96,"Mosquito" +129,/m/0h2mp,"Fly, housefly" +130,/m/07pjwq1,"Buzz" +131,/m/01h3n,"Bee, wasp, etc." 
+132,/m/09ld4,"Frog" +133,/m/07st88b,"Croak" +134,/m/078jl,"Snake" +135,/m/07qn4z3,"Rattle" +136,/m/032n05,"Whale vocalization" +137,/m/04rlf,"Music" +138,/m/04szw,"Musical instrument" +139,/m/0fx80y,"Plucked string instrument" +140,/m/0342h,"Guitar" +141,/m/02sgy,"Electric guitar" +142,/m/018vs,"Bass guitar" +143,/m/042v_gx,"Acoustic guitar" +144,/m/06w87,"Steel guitar, slide guitar" +145,/m/01glhc,"Tapping (guitar technique)" +146,/m/07s0s5r,"Strum" +147,/m/018j2,"Banjo" +148,/m/0jtg0,"Sitar" +149,/m/04rzd,"Mandolin" +150,/m/01bns_,"Zither" +151,/m/07xzm,"Ukulele" +152,/m/05148p4,"Keyboard (musical)" +153,/m/05r5c,"Piano" +154,/m/01s0ps,"Electric piano" +155,/m/013y1f,"Organ" +156,/m/03xq_f,"Electronic organ" +157,/m/03gvt,"Hammond organ" +158,/m/0l14qv,"Synthesizer" +159,/m/01v1d8,"Sampler" +160,/m/03q5t,"Harpsichord" +161,/m/0l14md,"Percussion" +162,/m/02hnl,"Drum kit" +163,/m/0cfdd,"Drum machine" +164,/m/026t6,"Drum" +165,/m/06rvn,"Snare drum" +166,/m/03t3fj,"Rimshot" +167,/m/02k_mr,"Drum roll" +168,/m/0bm02,"Bass drum" +169,/m/011k_j,"Timpani" +170,/m/01p970,"Tabla" +171,/m/01qbl,"Cymbal" +172,/m/03qtq,"Hi-hat" +173,/m/01sm1g,"Wood block" +174,/m/07brj,"Tambourine" +175,/m/05r5wn,"Rattle (instrument)" +176,/m/0xzly,"Maraca" +177,/m/0mbct,"Gong" +178,/m/016622,"Tubular bells" +179,/m/0j45pbj,"Mallet percussion" +180,/m/0dwsp,"Marimba, xylophone" +181,/m/0dwtp,"Glockenspiel" +182,/m/0dwt5,"Vibraphone" +183,/m/0l156b,"Steelpan" +184,/m/05pd6,"Orchestra" +185,/m/01kcd,"Brass instrument" +186,/m/0319l,"French horn" +187,/m/07gql,"Trumpet" +188,/m/07c6l,"Trombone" +189,/m/0l14_3,"Bowed string instrument" +190,/m/02qmj0d,"String section" +191,/m/07y_7,"Violin, fiddle" +192,/m/0d8_n,"Pizzicato" +193,/m/01xqw,"Cello" +194,/m/02fsn,"Double bass" +195,/m/085jw,"Wind instrument, woodwind instrument" +196,/m/0l14j_,"Flute" +197,/m/06ncr,"Saxophone" +198,/m/01wy6,"Clarinet" +199,/m/03m5k,"Harp" +200,/m/0395lw,"Bell" +201,/m/03w41f,"Church bell" +202,/m/027m70_,"Jingle bell" +203,/m/0gy1t2s,"Bicycle bell" +204,/m/07n_g,"Tuning fork" +205,/m/0f8s22,"Chime" +206,/m/026fgl,"Wind chime" +207,/m/0150b9,"Change ringing (campanology)" +208,/m/03qjg,"Harmonica" +209,/m/0mkg,"Accordion" +210,/m/0192l,"Bagpipes" +211,/m/02bxd,"Didgeridoo" +212,/m/0l14l2,"Shofar" +213,/m/07kc_,"Theremin" +214,/m/0l14t7,"Singing bowl" +215,/m/01hgjl,"Scratching (performance technique)" +216,/m/064t9,"Pop music" +217,/m/0glt670,"Hip hop music" +218,/m/02cz_7,"Beatboxing" +219,/m/06by7,"Rock music" +220,/m/03lty,"Heavy metal" +221,/m/05r6t,"Punk rock" +222,/m/0dls3,"Grunge" +223,/m/0dl5d,"Progressive rock" +224,/m/07sbbz2,"Rock and roll" +225,/m/05w3f,"Psychedelic rock" +226,/m/06j6l,"Rhythm and blues" +227,/m/0gywn,"Soul music" +228,/m/06cqb,"Reggae" +229,/m/01lyv,"Country" +230,/m/015y_n,"Swing music" +231,/m/0gg8l,"Bluegrass" +232,/m/02x8m,"Funk" +233,/m/02w4v,"Folk music" +234,/m/06j64v,"Middle Eastern music" +235,/m/03_d0,"Jazz" +236,/m/026z9,"Disco" +237,/m/0ggq0m,"Classical music" +238,/m/05lls,"Opera" +239,/m/02lkt,"Electronic music" +240,/m/03mb9,"House music" +241,/m/07gxw,"Techno" +242,/m/07s72n,"Dubstep" +243,/m/0283d,"Drum and bass" +244,/m/0m0jc,"Electronica" +245,/m/08cyft,"Electronic dance music" +246,/m/0fd3y,"Ambient music" +247,/m/07lnk,"Trance music" +248,/m/0g293,"Music of Latin America" +249,/m/0ln16,"Salsa music" +250,/m/0326g,"Flamenco" +251,/m/0155w,"Blues" +252,/m/05fw6t,"Music for children" +253,/m/02v2lh,"New-age music" +254,/m/0y4f8,"Vocal music" +255,/m/0z9c,"A capella" +256,/m/0164x2,"Music of 
Africa" +257,/m/0145m,"Afrobeat" +258,/m/02mscn,"Christian music" +259,/m/016cjb,"Gospel music" +260,/m/028sqc,"Music of Asia" +261,/m/015vgc,"Carnatic music" +262,/m/0dq0md,"Music of Bollywood" +263,/m/06rqw,"Ska" +264,/m/02p0sh1,"Traditional music" +265,/m/05rwpb,"Independent music" +266,/m/074ft,"Song" +267,/m/025td0t,"Background music" +268,/m/02cjck,"Theme music" +269,/m/03r5q_,"Jingle (music)" +270,/m/0l14gg,"Soundtrack music" +271,/m/07pkxdp,"Lullaby" +272,/m/01z7dr,"Video game music" +273,/m/0140xf,"Christmas music" +274,/m/0ggx5q,"Dance music" +275,/m/04wptg,"Wedding music" +276,/t/dd00031,"Happy music" +277,/t/dd00032,"Funny music" +278,/t/dd00033,"Sad music" +279,/t/dd00034,"Tender music" +280,/t/dd00035,"Exciting music" +281,/t/dd00036,"Angry music" +282,/t/dd00037,"Scary music" +283,/m/03m9d0z,"Wind" +284,/m/09t49,"Rustling leaves" +285,/t/dd00092,"Wind noise (microphone)" +286,/m/0jb2l,"Thunderstorm" +287,/m/0ngt1,"Thunder" +288,/m/0838f,"Water" +289,/m/06mb1,"Rain" +290,/m/07r10fb,"Raindrop" +291,/t/dd00038,"Rain on surface" +292,/m/0j6m2,"Stream" +293,/m/0j2kx,"Waterfall" +294,/m/05kq4,"Ocean" +295,/m/034srq,"Waves, surf" +296,/m/06wzb,"Steam" +297,/m/07swgks,"Gurgling" +298,/m/02_41,"Fire" +299,/m/07pzfmf,"Crackle" +300,/m/07yv9,"Vehicle" +301,/m/019jd,"Boat, Water vehicle" +302,/m/0hsrw,"Sailboat, sailing ship" +303,/m/056ks2,"Rowboat, canoe, kayak" +304,/m/02rlv9,"Motorboat, speedboat" +305,/m/06q74,"Ship" +306,/m/012f08,"Motor vehicle (road)" +307,/m/0k4j,"Car" +308,/m/0912c9,"Vehicle horn, car horn, honking" +309,/m/07qv_d5,"Toot" +310,/m/02mfyn,"Car alarm" +311,/m/04gxbd,"Power windows, electric windows" +312,/m/07rknqz,"Skidding" +313,/m/0h9mv,"Tire squeal" +314,/t/dd00134,"Car passing by" +315,/m/0ltv,"Race car, auto racing" +316,/m/07r04,"Truck" +317,/m/0gvgw0,"Air brake" +318,/m/05x_td,"Air horn, truck horn" +319,/m/02rhddq,"Reversing beeps" +320,/m/03cl9h,"Ice cream truck, ice cream van" +321,/m/01bjv,"Bus" +322,/m/03j1ly,"Emergency vehicle" +323,/m/04qvtq,"Police car (siren)" +324,/m/012n7d,"Ambulance (siren)" +325,/m/012ndj,"Fire engine, fire truck (siren)" +326,/m/04_sv,"Motorcycle" +327,/m/0btp2,"Traffic noise, roadway noise" +328,/m/06d_3,"Rail transport" +329,/m/07jdr,"Train" +330,/m/04zmvq,"Train whistle" +331,/m/0284vy3,"Train horn" +332,/m/01g50p,"Railroad car, train wagon" +333,/t/dd00048,"Train wheels squealing" +334,/m/0195fx,"Subway, metro, underground" +335,/m/0k5j,"Aircraft" +336,/m/014yck,"Aircraft engine" +337,/m/04229,"Jet engine" +338,/m/02l6bg,"Propeller, airscrew" +339,/m/09ct_,"Helicopter" +340,/m/0cmf2,"Fixed-wing aircraft, airplane" +341,/m/0199g,"Bicycle" +342,/m/06_fw,"Skateboard" +343,/m/02mk9,"Engine" +344,/t/dd00065,"Light engine (high frequency)" +345,/m/08j51y,"Dental drill, dentist's drill" +346,/m/01yg9g,"Lawn mower" +347,/m/01j4z9,"Chainsaw" +348,/t/dd00066,"Medium engine (mid frequency)" +349,/t/dd00067,"Heavy engine (low frequency)" +350,/m/01h82_,"Engine knocking" +351,/t/dd00130,"Engine starting" +352,/m/07pb8fc,"Idling" +353,/m/07q2z82,"Accelerating, revving, vroom" +354,/m/02dgv,"Door" +355,/m/03wwcy,"Doorbell" +356,/m/07r67yg,"Ding-dong" +357,/m/02y_763,"Sliding door" +358,/m/07rjzl8,"Slam" +359,/m/07r4wb8,"Knock" +360,/m/07qcpgn,"Tap" +361,/m/07q6cd_,"Squeak" +362,/m/0642b4,"Cupboard open or close" +363,/m/0fqfqc,"Drawer open or close" +364,/m/04brg2,"Dishes, pots, and pans" +365,/m/023pjk,"Cutlery, silverware" +366,/m/07pn_8q,"Chopping (food)" +367,/m/0dxrf,"Frying (food)" +368,/m/0fx9l,"Microwave oven" 
+369,/m/02pjr4,"Blender" +370,/m/02jz0l,"Water tap, faucet" +371,/m/0130jx,"Sink (filling or washing)" +372,/m/03dnzn,"Bathtub (filling or washing)" +373,/m/03wvsk,"Hair dryer" +374,/m/01jt3m,"Toilet flush" +375,/m/012xff,"Toothbrush" +376,/m/04fgwm,"Electric toothbrush" +377,/m/0d31p,"Vacuum cleaner" +378,/m/01s0vc,"Zipper (clothing)" +379,/m/03v3yw,"Keys jangling" +380,/m/0242l,"Coin (dropping)" +381,/m/01lsmm,"Scissors" +382,/m/02g901,"Electric shaver, electric razor" +383,/m/05rj2,"Shuffling cards" +384,/m/0316dw,"Typing" +385,/m/0c2wf,"Typewriter" +386,/m/01m2v,"Computer keyboard" +387,/m/081rb,"Writing" +388,/m/07pp_mv,"Alarm" +389,/m/07cx4,"Telephone" +390,/m/07pp8cl,"Telephone bell ringing" +391,/m/01hnzm,"Ringtone" +392,/m/02c8p,"Telephone dialing, DTMF" +393,/m/015jpf,"Dial tone" +394,/m/01z47d,"Busy signal" +395,/m/046dlr,"Alarm clock" +396,/m/03kmc9,"Siren" +397,/m/0dgbq,"Civil defense siren" +398,/m/030rvx,"Buzzer" +399,/m/01y3hg,"Smoke detector, smoke alarm" +400,/m/0c3f7m,"Fire alarm" +401,/m/04fq5q,"Foghorn" +402,/m/0l156k,"Whistle" +403,/m/06hck5,"Steam whistle" +404,/t/dd00077,"Mechanisms" +405,/m/02bm9n,"Ratchet, pawl" +406,/m/01x3z,"Clock" +407,/m/07qjznt,"Tick" +408,/m/07qjznl,"Tick-tock" +409,/m/0l7xg,"Gears" +410,/m/05zc1,"Pulleys" +411,/m/0llzx,"Sewing machine" +412,/m/02x984l,"Mechanical fan" +413,/m/025wky1,"Air conditioning" +414,/m/024dl,"Cash register" +415,/m/01m4t,"Printer" +416,/m/0dv5r,"Camera" +417,/m/07bjf,"Single-lens reflex camera" +418,/m/07k1x,"Tools" +419,/m/03l9g,"Hammer" +420,/m/03p19w,"Jackhammer" +421,/m/01b82r,"Sawing" +422,/m/02p01q,"Filing (rasp)" +423,/m/023vsd,"Sanding" +424,/m/0_ksk,"Power tool" +425,/m/01d380,"Drill" +426,/m/014zdl,"Explosion" +427,/m/032s66,"Gunshot, gunfire" +428,/m/04zjc,"Machine gun" +429,/m/02z32qm,"Fusillade" +430,/m/0_1c,"Artillery fire" +431,/m/073cg4,"Cap gun" +432,/m/0g6b5,"Fireworks" +433,/g/122z_qxw,"Firecracker" +434,/m/07qsvvw,"Burst, pop" +435,/m/07pxg6y,"Eruption" +436,/m/07qqyl4,"Boom" +437,/m/083vt,"Wood" +438,/m/07pczhz,"Chop" +439,/m/07pl1bw,"Splinter" +440,/m/07qs1cx,"Crack" +441,/m/039jq,"Glass" +442,/m/07q7njn,"Chink, clink" +443,/m/07rn7sz,"Shatter" +444,/m/04k94,"Liquid" +445,/m/07rrlb6,"Splash, splatter" +446,/m/07p6mqd,"Slosh" +447,/m/07qlwh6,"Squish" +448,/m/07r5v4s,"Drip" +449,/m/07prgkl,"Pour" +450,/m/07pqc89,"Trickle, dribble" +451,/t/dd00088,"Gush" +452,/m/07p7b8y,"Fill (with liquid)" +453,/m/07qlf79,"Spray" +454,/m/07ptzwd,"Pump (liquid)" +455,/m/07ptfmf,"Stir" +456,/m/0dv3j,"Boiling" +457,/m/0790c,"Sonar" +458,/m/0dl83,"Arrow" +459,/m/07rqsjt,"Whoosh, swoosh, swish" +460,/m/07qnq_y,"Thump, thud" +461,/m/07rrh0c,"Thunk" +462,/m/0b_fwt,"Electronic tuner" +463,/m/02rr_,"Effects unit" +464,/m/07m2kt,"Chorus effect" +465,/m/018w8,"Basketball bounce" +466,/m/07pws3f,"Bang" +467,/m/07ryjzk,"Slap, smack" +468,/m/07rdhzs,"Whack, thwack" +469,/m/07pjjrj,"Smash, crash" +470,/m/07pc8lb,"Breaking" +471,/m/07pqn27,"Bouncing" +472,/m/07rbp7_,"Whip" +473,/m/07pyf11,"Flap" +474,/m/07qb_dv,"Scratch" +475,/m/07qv4k0,"Scrape" +476,/m/07pdjhy,"Rub" +477,/m/07s8j8t,"Roll" +478,/m/07plct2,"Crushing" +479,/t/dd00112,"Crumpling, crinkling" +480,/m/07qcx4z,"Tearing" +481,/m/02fs_r,"Beep, bleep" +482,/m/07qwdck,"Ping" +483,/m/07phxs1,"Ding" +484,/m/07rv4dm,"Clang" +485,/m/07s02z0,"Squeal" +486,/m/07qh7jl,"Creak" +487,/m/07qwyj0,"Rustle" +488,/m/07s34ls,"Whir" +489,/m/07qmpdm,"Clatter" +490,/m/07p9k1k,"Sizzle" +491,/m/07qc9xj,"Clicking" +492,/m/07rwm0c,"Clickety-clack" +493,/m/07phhsh,"Rumble" +494,/m/07qyrcz,"Plop" 
+495,/m/07qfgpx,"Jingle, tinkle" +496,/m/07rcgpl,"Hum" +497,/m/07p78v5,"Zing" +498,/t/dd00121,"Boing" +499,/m/07s12q4,"Crunch" +500,/m/028v0c,"Silence" +501,/m/01v_m0,"Sine wave" +502,/m/0b9m1,"Harmonic" +503,/m/0hdsk,"Chirp tone" +504,/m/0c1dj,"Sound effect" +505,/m/07pt_g0,"Pulse" +506,/t/dd00125,"Inside, small room" +507,/t/dd00126,"Inside, large room or hall" +508,/t/dd00127,"Inside, public space" +509,/t/dd00128,"Outside, urban or manmade" +510,/t/dd00129,"Outside, rural or natural" +511,/m/01b9nn,"Reverberation" +512,/m/01jnbd,"Echo" +513,/m/096m7z,"Noise" +514,/m/06_y0by,"Environmental noise" +515,/m/07rgkc5,"Static" +516,/m/06xkwv,"Mains hum" +517,/m/0g12c5,"Distortion" +518,/m/08p9q4,"Sidetone" +519,/m/07szfh9,"Cacophony" +520,/m/0chx_,"White noise" +521,/m/0cj0r,"Pink noise" +522,/m/07p_0gm,"Throbbing" +523,/m/01jwx6,"Vibration" +524,/m/07c52,"Television" +525,/m/06bz3,"Radio" +526,/m/07hvw1,"Field recording" diff --git a/audio_detection/audio_infer/pytorch/evaluate.py b/audio_detection/audio_infer/pytorch/evaluate.py new file mode 100644 index 0000000..7f1fa38 --- /dev/null +++ b/audio_detection/audio_infer/pytorch/evaluate.py @@ -0,0 +1,42 @@ +from sklearn import metrics + +from pytorch_utils import forward + + +class Evaluator(object): + def __init__(self, model): + """Evaluator. + + Args: + model: object + """ + self.model = model + + def evaluate(self, data_loader): + """Forward evaluation data and calculate statistics. + + Args: + data_loader: object + + Returns: + statistics: dict, + {'average_precision': (classes_num,), 'auc': (classes_num,)} + """ + + # Forward + output_dict = forward( + model=self.model, + generator=data_loader, + return_target=True) + + clipwise_output = output_dict['clipwise_output'] # (audios_num, classes_num) + target = output_dict['target'] # (audios_num, classes_num) + + average_precision = metrics.average_precision_score( + target, clipwise_output, average=None) + + auc = metrics.roc_auc_score(target, clipwise_output, average=None) + + statistics = {'average_precision': average_precision, 'auc': auc} + + return statistics \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/finetune_template.py b/audio_detection/audio_infer/pytorch/finetune_template.py new file mode 100644 index 0000000..dd43e46 --- /dev/null +++ b/audio_detection/audio_infer/pytorch/finetune_template.py @@ -0,0 +1,127 @@ +import os +import sys +sys.path.insert(1, os.path.join(sys.path[0], '../utils')) +import numpy as np +import argparse +import h5py +import math +import time +import logging +import matplotlib.pyplot as plt + +import torch +torch.backends.cudnn.benchmark=True +torch.manual_seed(0) +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data + +from utilities import get_filename +from models import * +import config + + +class Transfer_Cnn14(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num, freeze_base): + """Classifier for a new task using pretrained Cnn14 as a sub module. 
+ """ + super(Transfer_Cnn14, self).__init__() + audioset_classes_num = 527 + + self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, audioset_classes_num) + + # Transfer to another task layer + self.fc_transfer = nn.Linear(2048, classes_num, bias=True) + + if freeze_base: + # Freeze AudioSet pretrained layers + for param in self.base.parameters(): + param.requires_grad = False + + self.init_weights() + + def init_weights(self): + init_layer(self.fc_transfer) + + def load_from_pretrain(self, pretrained_checkpoint_path): + checkpoint = torch.load(pretrained_checkpoint_path) + self.base.load_state_dict(checkpoint['model']) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, data_length) + """ + output_dict = self.base(input, mixup_lambda) + embedding = output_dict['embedding'] + + clipwise_output = torch.log_softmax(self.fc_transfer(embedding), dim=-1) + output_dict['clipwise_output'] = clipwise_output + + return output_dict + + +def train(args): + + # Arugments & parameters + sample_rate = args.sample_rate + window_size = args.window_size + hop_size = args.hop_size + mel_bins = args.mel_bins + fmin = args.fmin + fmax = args.fmax + model_type = args.model_type + pretrained_checkpoint_path = args.pretrained_checkpoint_path + freeze_base = args.freeze_base + device = 'cuda' if (args.cuda and torch.cuda.is_available()) else 'cpu' + + classes_num = config.classes_num + pretrain = True if pretrained_checkpoint_path else False + + # Model + Model = eval(model_type) + model = Model(sample_rate, window_size, hop_size, mel_bins, fmin, fmax, + classes_num, freeze_base) + + # Load pretrained model + if pretrain: + logging.info('Load pretrained model from {}'.format(pretrained_checkpoint_path)) + model.load_from_pretrain(pretrained_checkpoint_path) + + # Parallel + print('GPU number: {}'.format(torch.cuda.device_count())) + model = torch.nn.DataParallel(model) + + if 'cuda' in device: + model.to(device) + + print('Load pretrained model successfully!') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Example of parser. 
') + subparsers = parser.add_subparsers(dest='mode') + + # Train + parser_train = subparsers.add_parser('train') + parser_train.add_argument('--sample_rate', type=int, required=True) + parser_train.add_argument('--window_size', type=int, required=True) + parser_train.add_argument('--hop_size', type=int, required=True) + parser_train.add_argument('--mel_bins', type=int, required=True) + parser_train.add_argument('--fmin', type=int, required=True) + parser_train.add_argument('--fmax', type=int, required=True) + parser_train.add_argument('--model_type', type=str, required=True) + parser_train.add_argument('--pretrained_checkpoint_path', type=str) + parser_train.add_argument('--freeze_base', action='store_true', default=False) + parser_train.add_argument('--cuda', action='store_true', default=False) + + # Parse arguments + args = parser.parse_args() + args.filename = get_filename(__file__) + + if args.mode == 'train': + train(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/inference.py b/audio_detection/audio_infer/pytorch/inference.py new file mode 100644 index 0000000..49dc75f --- /dev/null +++ b/audio_detection/audio_infer/pytorch/inference.py @@ -0,0 +1,206 @@ +import os +import sys +sys.path.insert(1, os.path.join(sys.path[0], '../utils')) +import numpy as np +import argparse +import librosa +import matplotlib.pyplot as plt +import torch + +from utilities import create_folder, get_filename +from models import * +from pytorch_utils import move_data_to_device +import config + +def audio_tagging(args): + """Inference audio tagging result of an audio clip. + """ + + # Arugments & parameters + sample_rate = args.sample_rate + window_size = args.window_size + hop_size = args.hop_size + mel_bins = args.mel_bins + fmin = args.fmin + fmax = args.fmax + model_type = args.model_type + checkpoint_path = args.checkpoint_path + audio_path = args.audio_path + device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu') + + classes_num = config.classes_num + labels = config.labels + + # Model + Model = eval(model_type) + model = Model(sample_rate=sample_rate, window_size=window_size, + hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax, + classes_num=classes_num) + + checkpoint = torch.load(checkpoint_path, map_location=device) + model.load_state_dict(checkpoint['model']) + + # Parallel + if 'cuda' in str(device): + model.to(device) + print('GPU number: {}'.format(torch.cuda.device_count())) + model = torch.nn.DataParallel(model) + else: + print('Using CPU.') + + # Load audio + (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True) + + waveform = waveform[None, :] # (1, audio_length) + waveform = move_data_to_device(waveform, device) + + # Forward + with torch.no_grad(): + model.eval() + batch_output_dict = model(waveform, None) + + clipwise_output = batch_output_dict['clipwise_output'].data.cpu().numpy()[0] + """(classes_num,)""" + + sorted_indexes = np.argsort(clipwise_output)[::-1] + + # Print audio tagging top probabilities + for k in range(10): + print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]], + clipwise_output[sorted_indexes[k]])) + + # Print embedding + if 'embedding' in batch_output_dict.keys(): + embedding = batch_output_dict['embedding'].data.cpu().numpy()[0] + print('embedding: {}'.format(embedding.shape)) + + return clipwise_output, labels + + +def sound_event_detection(args): + """Inference sound event detection result of an audio 
clip. + """ + + # Arugments & parameters + sample_rate = args.sample_rate + window_size = args.window_size + hop_size = args.hop_size + mel_bins = args.mel_bins + fmin = args.fmin + fmax = args.fmax + model_type = args.model_type + checkpoint_path = args.checkpoint_path + audio_path = args.audio_path + device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu') + + classes_num = config.classes_num + labels = config.labels + frames_per_second = sample_rate // hop_size + + # Paths + fig_path = os.path.join('results', '{}.png'.format(get_filename(audio_path))) + create_folder(os.path.dirname(fig_path)) + + # Model + Model = eval(model_type) + model = Model(sample_rate=sample_rate, window_size=window_size, + hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax, + classes_num=classes_num) + + checkpoint = torch.load(checkpoint_path, map_location=device) + model.load_state_dict(checkpoint['model']) + + # Parallel + print('GPU number: {}'.format(torch.cuda.device_count())) + model = torch.nn.DataParallel(model) + + if 'cuda' in str(device): + model.to(device) + + # Load audio + (waveform, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True) + + waveform = waveform[None, :] # (1, audio_length) + waveform = move_data_to_device(waveform, device) + + # Forward + with torch.no_grad(): + model.eval() + batch_output_dict = model(waveform, None) + + framewise_output = batch_output_dict['framewise_output'].data.cpu().numpy()[0] + """(time_steps, classes_num)""" + + print('Sound event detection result (time_steps x classes_num): {}'.format( + framewise_output.shape)) + + sorted_indexes = np.argsort(np.max(framewise_output, axis=0))[::-1] + + top_k = 10 # Show top results + top_result_mat = framewise_output[:, sorted_indexes[0 : top_k]] + """(time_steps, top_k)""" + + # Plot result + stft = librosa.core.stft(y=waveform[0].data.cpu().numpy(), n_fft=window_size, + hop_length=hop_size, window='hann', center=True) + frames_num = stft.shape[-1] + + fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 4)) + axs[0].matshow(np.log(np.abs(stft)), origin='lower', aspect='auto', cmap='jet') + axs[0].set_ylabel('Frequency bins') + axs[0].set_title('Log spectrogram') + axs[1].matshow(top_result_mat.T, origin='upper', aspect='auto', cmap='jet', vmin=0, vmax=1) + axs[1].xaxis.set_ticks(np.arange(0, frames_num, frames_per_second)) + axs[1].xaxis.set_ticklabels(np.arange(0, frames_num / frames_per_second)) + axs[1].yaxis.set_ticks(np.arange(0, top_k)) + axs[1].yaxis.set_ticklabels(np.array(labels)[sorted_indexes[0 : top_k]]) + axs[1].yaxis.grid(color='k', linestyle='solid', linewidth=0.3, alpha=0.3) + axs[1].set_xlabel('Seconds') + axs[1].xaxis.set_ticks_position('bottom') + + plt.tight_layout() + plt.savefig(fig_path) + print('Save sound event detection visualization to {}'.format(fig_path)) + + return framewise_output, labels + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Example of parser. 
') + subparsers = parser.add_subparsers(dest='mode') + + parser_at = subparsers.add_parser('audio_tagging') + parser_at.add_argument('--sample_rate', type=int, default=32000) + parser_at.add_argument('--window_size', type=int, default=1024) + parser_at.add_argument('--hop_size', type=int, default=320) + parser_at.add_argument('--mel_bins', type=int, default=64) + parser_at.add_argument('--fmin', type=int, default=50) + parser_at.add_argument('--fmax', type=int, default=14000) + parser_at.add_argument('--model_type', type=str, required=True) + parser_at.add_argument('--checkpoint_path', type=str, required=True) + parser_at.add_argument('--audio_path', type=str, required=True) + parser_at.add_argument('--cuda', action='store_true', default=False) + + parser_sed = subparsers.add_parser('sound_event_detection') + parser_sed.add_argument('--sample_rate', type=int, default=32000) + parser_sed.add_argument('--window_size', type=int, default=1024) + parser_sed.add_argument('--hop_size', type=int, default=320) + parser_sed.add_argument('--mel_bins', type=int, default=64) + parser_sed.add_argument('--fmin', type=int, default=50) + parser_sed.add_argument('--fmax', type=int, default=14000) + parser_sed.add_argument('--model_type', type=str, required=True) + parser_sed.add_argument('--checkpoint_path', type=str, required=True) + parser_sed.add_argument('--audio_path', type=str, required=True) + parser_sed.add_argument('--cuda', action='store_true', default=False) + + args = parser.parse_args() + + if args.mode == 'audio_tagging': + audio_tagging(args) + + elif args.mode == 'sound_event_detection': + sound_event_detection(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/losses.py b/audio_detection/audio_infer/pytorch/losses.py new file mode 100644 index 0000000..587e8a6 --- /dev/null +++ b/audio_detection/audio_infer/pytorch/losses.py @@ -0,0 +1,14 @@ +import torch +import torch.nn.functional as F + + +def clip_bce(output_dict, target_dict): + """Binary crossentropy loss. 
+ """ + return F.binary_cross_entropy( + output_dict['clipwise_output'], target_dict['target']) + + +def get_loss_func(loss_type): + if loss_type == 'clip_bce': + return clip_bce \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/main.py b/audio_detection/audio_infer/pytorch/main.py new file mode 100644 index 0000000..3582935 --- /dev/null +++ b/audio_detection/audio_infer/pytorch/main.py @@ -0,0 +1,378 @@ +import os +import sys +sys.path.insert(1, os.path.join(sys.path[0], '../utils')) +import numpy as np +import argparse +import time +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data + +from utilities import (create_folder, get_filename, create_logging, Mixup, + StatisticsContainer) +from models import (PVT, PVT2, PVT_lr, PVT_nopretrain, PVT_2layer, Cnn14, Cnn14_no_specaug, Cnn14_no_dropout, + Cnn6, Cnn10, ResNet22, ResNet38, ResNet54, Cnn14_emb512, Cnn14_emb128, + Cnn14_emb32, MobileNetV1, MobileNetV2, LeeNet11, LeeNet24, DaiNet19, + Res1dNet31, Res1dNet51, Wavegram_Cnn14, Wavegram_Logmel_Cnn14, + Wavegram_Logmel128_Cnn14, Cnn14_16k, Cnn14_8k, Cnn14_mel32, Cnn14_mel128, + Cnn14_mixup_time_domain, Cnn14_DecisionLevelMax, Cnn14_DecisionLevelAtt, Cnn6_Transformer, GLAM, GLAM2, GLAM3, Cnn4, EAT) +#from models_test import (PVT_test) +#from models1 import (PVT1) +#from models_vig import (VIG, VIG2) +#from models_vvt import (VVT) +#from models2 import (MPVIT, MPVIT2) +#from models_reshape import (PVT_reshape, PVT_tscam) +#from models_swin import (Swin, Swin_nopretrain) +#from models_swin2 import (Swin2) +#from models_van import (Van, Van_tiny) +#from models_focal import (Focal) +#from models_cross import (Cross) +#from models_cov import (Cov) +#from models_cnn import (Cnn_light) +#from models_twins import (Twins) +#from models_cmt import (Cmt, Cmt1) +#from models_shunted import (Shunted) +#from models_quadtree import (Quadtree, Quadtree2, Quadtree_nopretrain) +#from models_davit import (Davit_tscam, Davit, Davit_nopretrain) +from pytorch_utils import (move_data_to_device, count_parameters, count_flops, + do_mixup) +from data_generator import (AudioSetDataset, TrainSampler, BalancedTrainSampler, + AlternateTrainSampler, EvaluateSampler, collate_fn) +from evaluate import Evaluator +import config +from losses import get_loss_func + + +def train(args): + """Train AudioSet tagging model. 
+ + Args: + dataset_dir: str + workspace: str + data_type: 'balanced_train' | 'full_train' + window_size: int + hop_size: int + mel_bins: int + model_type: str + loss_type: 'clip_bce' + balanced: 'none' | 'balanced' | 'alternate' + augmentation: 'none' | 'mixup' + batch_size: int + learning_rate: float + resume_iteration: int + early_stop: int + accumulation_steps: int + cuda: bool + """ + + # Arugments & parameters + workspace = args.workspace + data_type = args.data_type + sample_rate = args.sample_rate + window_size = args.window_size + hop_size = args.hop_size + mel_bins = args.mel_bins + fmin = args.fmin + fmax = args.fmax + model_type = args.model_type + loss_type = args.loss_type + balanced = args.balanced + augmentation = args.augmentation + batch_size = args.batch_size + learning_rate = args.learning_rate + resume_iteration = args.resume_iteration + early_stop = args.early_stop + device = torch.device('cuda') if args.cuda and torch.cuda.is_available() else torch.device('cpu') + filename = args.filename + + num_workers = 8 + clip_samples = config.clip_samples + classes_num = config.classes_num + loss_func = get_loss_func(loss_type) + + # Paths + black_list_csv = None + + train_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes', + '{}.h5'.format(data_type)) + + eval_bal_indexes_hdf5_path = os.path.join(workspace, + 'hdf5s', 'indexes', 'balanced_train.h5') + + eval_test_indexes_hdf5_path = os.path.join(workspace, 'hdf5s', 'indexes', + 'eval.h5') + + checkpoints_dir = os.path.join(workspace, 'checkpoints', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size)) + create_folder(checkpoints_dir) + + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + create_folder(os.path.dirname(statistics_path)) + + logs_dir = os.path.join(workspace, 'logs', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size)) + + create_logging(logs_dir, filemode='w') + logging.info(args) + + if 'cuda' in str(device): + logging.info('Using GPU.') + device = 'cuda' + else: + logging.info('Using CPU. 
Set --cuda flag to use GPU.') + device = 'cpu' + + # Model + Model = eval(model_type) + model = Model(sample_rate=sample_rate, window_size=window_size, + hop_size=hop_size, mel_bins=mel_bins, fmin=fmin, fmax=fmax, + classes_num=classes_num) + total = sum(p.numel() for p in model.parameters()) + print("Total params: %.2fM" % (total/1e6)) + logging.info("Total params: %.2fM" % (total/1e6)) + #params_num = count_parameters(model) + # flops_num = count_flops(model, clip_samples) + #logging.info('Parameters num: {}'.format(params_num)) + # logging.info('Flops num: {:.3f} G'.format(flops_num / 1e9)) + + # Dataset will be used by DataLoader later. Dataset takes a meta as input + # and return a waveform and a target. + dataset = AudioSetDataset(sample_rate=sample_rate) + + # Train sampler + if balanced == 'none': + Sampler = TrainSampler + elif balanced == 'balanced': + Sampler = BalancedTrainSampler + elif balanced == 'alternate': + Sampler = AlternateTrainSampler + + train_sampler = Sampler( + indexes_hdf5_path=train_indexes_hdf5_path, + batch_size=batch_size * 2 if 'mixup' in augmentation else batch_size, + black_list_csv=black_list_csv) + + # Evaluate sampler + eval_bal_sampler = EvaluateSampler( + indexes_hdf5_path=eval_bal_indexes_hdf5_path, batch_size=batch_size) + + eval_test_sampler = EvaluateSampler( + indexes_hdf5_path=eval_test_indexes_hdf5_path, batch_size=batch_size) + + # Data loader + train_loader = torch.utils.data.DataLoader(dataset=dataset, + batch_sampler=train_sampler, collate_fn=collate_fn, + num_workers=num_workers, pin_memory=True) + + eval_bal_loader = torch.utils.data.DataLoader(dataset=dataset, + batch_sampler=eval_bal_sampler, collate_fn=collate_fn, + num_workers=num_workers, pin_memory=True) + + eval_test_loader = torch.utils.data.DataLoader(dataset=dataset, + batch_sampler=eval_test_sampler, collate_fn=collate_fn, + num_workers=num_workers, pin_memory=True) + mix=0.5 + if 'mixup' in augmentation: + mixup_augmenter = Mixup(mixup_alpha=mix) + print(mix) + logging.info(mix) + + # Evaluator + evaluator = Evaluator(model=model) + + # Statistics + statistics_container = StatisticsContainer(statistics_path) + + # Optimizer + optimizer = optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.05, amsgrad=True) + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4, min_lr=1e-06, verbose=True) + train_bgn_time = time.time() + + # Resume training + if resume_iteration > 0: + resume_checkpoint_path = os.path.join(workspace, 'checkpoints', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + '{}_iterations.pth'.format(resume_iteration)) + + logging.info('Loading checkpoint {}'.format(resume_checkpoint_path)) + checkpoint = torch.load(resume_checkpoint_path) + model.load_state_dict(checkpoint['model']) + train_sampler.load_state_dict(checkpoint['sampler']) + statistics_container.load_state_dict(resume_iteration) + iteration = checkpoint['iteration'] + + else: + iteration = 0 + + # Parallel + print('GPU number: {}'.format(torch.cuda.device_count())) + model = torch.nn.DataParallel(model) + + if 'cuda' in str(device): + model.to(device) + + if resume_iteration: + 
optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print(optimizer.state_dict()['param_groups'][0]['lr']) + + time1 = time.time() + + for batch_data_dict in train_loader: + """batch_data_dict: { + 'audio_name': (batch_size [*2 if mixup],), + 'waveform': (batch_size [*2 if mixup], clip_samples), + 'target': (batch_size [*2 if mixup], classes_num), + (ifexist) 'mixup_lambda': (batch_size * 2,)} + """ + + # Evaluate + if (iteration % 2000 == 0 and iteration >= resume_iteration) or (iteration == 0): + train_fin_time = time.time() + + bal_statistics = evaluator.evaluate(eval_bal_loader) + test_statistics = evaluator.evaluate(eval_test_loader) + + logging.info('Validate bal mAP: {:.3f}'.format( + np.mean(bal_statistics['average_precision']))) + + logging.info('Validate test mAP: {:.3f}'.format( + np.mean(test_statistics['average_precision']))) + + statistics_container.append(iteration, bal_statistics, data_type='bal') + statistics_container.append(iteration, test_statistics, data_type='test') + statistics_container.dump() + + train_time = train_fin_time - train_bgn_time + validate_time = time.time() - train_fin_time + + logging.info( + 'iteration: {}, train time: {:.3f} s, validate time: {:.3f} s' + ''.format(iteration, train_time, validate_time)) + + logging.info('------------------------------------') + + train_bgn_time = time.time() + + # Save model + if iteration % 2000 == 0: + checkpoint = { + 'iteration': iteration, + 'model': model.module.state_dict(), + 'sampler': train_sampler.state_dict(), + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict()} + + checkpoint_path = os.path.join( + checkpoints_dir, '{}_iterations.pth'.format(iteration)) + + torch.save(checkpoint, checkpoint_path) + logging.info('Model saved to {}'.format(checkpoint_path)) + + # Mixup lambda + if 'mixup' in augmentation: + batch_data_dict['mixup_lambda'] = mixup_augmenter.get_lambda( + batch_size=len(batch_data_dict['waveform'])) + + # Move data to device + for key in batch_data_dict.keys(): + batch_data_dict[key] = move_data_to_device(batch_data_dict[key], device) + + # Forward + model.train() + + if 'mixup' in augmentation: + batch_output_dict = model(batch_data_dict['waveform'], + batch_data_dict['mixup_lambda']) + """{'clipwise_output': (batch_size, classes_num), ...}""" + + batch_target_dict = {'target': do_mixup(batch_data_dict['target'], + batch_data_dict['mixup_lambda'])} + """{'target': (batch_size, classes_num)}""" + else: + batch_output_dict = model(batch_data_dict['waveform'], None) + """{'clipwise_output': (batch_size, classes_num), ...}""" + + batch_target_dict = {'target': batch_data_dict['target']} + """{'target': (batch_size, classes_num)}""" + + # Loss + loss = loss_func(batch_output_dict, batch_target_dict) + # Backward + loss.backward() + + optimizer.step() + optimizer.zero_grad() + + if iteration % 10 == 0: + print(iteration, loss) + #print('--- Iteration: {}, train time: {:.3f} s / 10 iterations ---'\ + # .format(iteration, time.time() - time1)) + #time1 = time.time() + + if iteration % 2000 == 0: + scheduler.step(np.mean(test_statistics['average_precision'])) + print(optimizer.state_dict()['param_groups'][0]['lr']) + logging.info(optimizer.state_dict()['param_groups'][0]['lr']) + + # Stop learning + if iteration == early_stop: + break + + iteration += 1 + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Example of parser. 
') + subparsers = parser.add_subparsers(dest='mode') + + parser_train = subparsers.add_parser('train') + parser_train.add_argument('--workspace', type=str, required=True) + parser_train.add_argument('--data_type', type=str, default='full_train', choices=['balanced_train', 'full_train']) + parser_train.add_argument('--sample_rate', type=int, default=32000) + parser_train.add_argument('--window_size', type=int, default=1024) + parser_train.add_argument('--hop_size', type=int, default=320) + parser_train.add_argument('--mel_bins', type=int, default=64) + parser_train.add_argument('--fmin', type=int, default=50) + parser_train.add_argument('--fmax', type=int, default=14000) + parser_train.add_argument('--model_type', type=str, required=True) + parser_train.add_argument('--loss_type', type=str, default='clip_bce', choices=['clip_bce']) + parser_train.add_argument('--balanced', type=str, default='balanced', choices=['none', 'balanced', 'alternate']) + parser_train.add_argument('--augmentation', type=str, default='mixup', choices=['none', 'mixup']) + parser_train.add_argument('--batch_size', type=int, default=32) + parser_train.add_argument('--learning_rate', type=float, default=1e-3) + parser_train.add_argument('--resume_iteration', type=int, default=0) + parser_train.add_argument('--early_stop', type=int, default=1000000) + parser_train.add_argument('--cuda', action='store_true', default=False) + + args = parser.parse_args() + args.filename = get_filename(__file__) + + if args.mode == 'train': + train(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/pytorch/models.py b/audio_detection/audio_infer/pytorch/models.py new file mode 100644 index 0000000..dc225a3 --- /dev/null +++ b/audio_detection/audio_infer/pytorch/models.py @@ -0,0 +1,951 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchlibrosa.stft import Spectrogram, LogmelFilterBank +from torchlibrosa.augmentation import SpecAugmentation + +from audio_infer.pytorch.pytorch_utils import do_mixup, interpolate, pad_framewise_output +import os +import sys +import math +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.parameter import Parameter +from torchlibrosa.stft import Spectrogram, LogmelFilterBank +from torchlibrosa.augmentation import SpecAugmentation +from audio_infer.pytorch.pytorch_utils import do_mixup +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +import warnings +from functools import partial +#from mmdet.models.builder import BACKBONES +from mmdet.utils import get_root_logger +from mmcv.runner import load_checkpoint +os.environ['TORCH_HOME'] = '../pretrained_models' +from copy import deepcopy +from timm.models.helpers import load_pretrained +from torch.cuda.amp import autocast +from collections import OrderedDict +import io +import re +from mmcv.runner import _load_checkpoint, load_state_dict +import mmcv.runner +import copy +import random +from einops import rearrange +from einops.layers.torch import Rearrange, Reduce +from torch import nn, einsum + + +def load_checkpoint(model, + filename, + map_location=None, + strict=False, + logger=None, + revise_keys=[(r'^module\.', '')]): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. 
Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + revise_keys (list): A list of customized keywords to modify the + state_dict in checkpoint. Each item is a (pattern, replacement) + pair of the regular expression operations. Default: strip + the prefix 'module.' by [(r'^module\\.', '')]. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + checkpoint = _load_checkpoint(filename, map_location, logger) + new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + # strip prefix of state_dict + metadata = getattr(state_dict, '_metadata', OrderedDict()) + for p, r in revise_keys: + state_dict = OrderedDict( + {re.sub(p, r, k): v + for k, v in state_dict.items()}) + state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()}) + # Keep metadata in state_dict + state_dict._metadata = metadata + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + +def init_layer(layer): + """Initialize a Linear or Convolutional layer. """ + nn.init.xavier_uniform_(layer.weight) + + if hasattr(layer, 'bias'): + if layer.bias is not None: + layer.bias.data.fill_(0.) + + +def init_bn(bn): + """Initialize a Batchnorm layer. """ + bn.bias.data.fill_(0.) + bn.weight.data.fill_(1.) + + + + +class TimeShift(nn.Module): + def __init__(self, mean, std): + super().__init__() + self.mean = mean + self.std = std + + def forward(self, x): + if self.training: + shift = torch.empty(1).normal_(self.mean, self.std).int().item() + x = torch.roll(x, shift, dims=2) + return x + +class LinearSoftPool(nn.Module): + """LinearSoftPool + Linear softmax, takes logits and returns a probability, near to the actual maximum value. 
+ Taken from the paper: + A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling + https://arxiv.org/abs/1810.09050 + """ + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, time_decision): + return (time_decision**2).sum(self.pooldim) / time_decision.sum( + self.pooldim) + +class PVT(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num): + + super(PVT, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128, 320, 512], + depths=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4, 2, 1], + norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=4, + #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + #self.temp_pool = LinearSoftPool() + self.avgpool = nn.AdaptiveAvgPool1d(1) + self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 32 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + #clipwise_output = torch.mean(framewise_output, dim=1) + #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + x = framewise_output.transpose(1, 2).contiguous() + x = self.avgpool(x) + clipwise_output = torch.flatten(x, 1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + #framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + +class PVT2(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, 
classes_num): + + super(PVT2, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128, 320, 512], + depths=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4, 2, 1], + norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=4, + pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + #self.temp_pool = LinearSoftPool() + self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 32 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + #x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + clipwise_output = torch.mean(framewise_output, dim=1) + #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + #framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + +class PVT_2layer(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num): + + super(PVT_2layer, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + 
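To make the shape bookkeeping of these PVT variants concrete, the following is a minimal sketch (NumPy only, no weights) of how a 10-second, 32 kHz clip maps onto STFT frames, how much the transformer shrinks the time axis, and how `interpolate` plus temporal averaging recover frame-level and clip-level scores. The sample rate, hop size and `interpolate_ratio` mirror the values used in this patch; `classes_num` is left as a placeholder because the real value comes from the detection config.

```python
import numpy as np

sample_rate = 32000          # values used throughout this patch
hop_size = 320
clip_seconds = 10
interpolate_ratio = 32       # matches PVT / PVT2 / PVT_lr above

clip_samples = sample_rate * clip_seconds            # 320000 samples
frames_num = clip_samples // hop_size + 1            # 1001 STFT frames (center=True), i.e. tdim
coarse_steps = frames_num // interpolate_ratio       # ~31 time steps left after the PVT stages

# Dummy frame-level probabilities at the coarse resolution the transformer produces:
classes_num = 17                                     # placeholder; the real value is config.classes_num
coarse = np.random.rand(1, coarse_steps, classes_num)

# interpolate() (defined in pytorch_utils further down in this patch) simply repeats
# every coarse step `ratio` times, bringing the output back near the original frame
# rate; pad_framewise_output can then extend it to exactly frames_num.
framewise = coarse.repeat(interpolate_ratio, axis=1)  # (1, 992, classes_num)

# Clip-level score as in PVT / PVT_2layer: AdaptiveAvgPool1d(1) over time is
# just the mean of the frame probabilities.
clipwise = framewise.mean(axis=1)                     # (1, classes_num)
print(frames_num, coarse_steps, framewise.shape, clipwise.shape)
```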
self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128], + depths=[3, 4], + num_heads=[1, 2], + mlp_ratios=[8, 8], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4], + norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=2, + pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + #self.temp_pool = LinearSoftPool() + self.avgpool = nn.AdaptiveAvgPool1d(1) + self.fc_audioset = nn.Linear(128, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 8 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + #clipwise_output = torch.mean(framewise_output, dim=1) + #clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + x = framewise_output.transpose(1, 2).contiguous() + x = self.avgpool(x) + clipwise_output = torch.flatten(x, 1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + #framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + +class PVT_lr(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num): + + super(PVT_lr, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128, 320, 512], + depths=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4, 2, 1], + 
norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=4, + pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + self.temp_pool = LinearSoftPool() + self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 32 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + #framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + + +class PVT_nopretrain(nn.Module): + def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, classes_num): + + super(PVT_nopretrain, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + self.time_shift = TimeShift(0, 10) + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + self.pvt_transformer = PyramidVisionTransformerV2(tdim=1001, + fdim=64, + patch_size=7, + stride=4, + in_chans=1, + num_classes=classes_num, + embed_dims=[64, 128, 320, 512], + depths=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + drop_path_rate=0.1, + sr_ratios=[8, 4, 2, 1], + norm_layer=partial(nn.LayerNorm, eps=1e-6), + num_stages=4, + #pretrained='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth' + ) + self.temp_pool = LinearSoftPool() + self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + self.init_weights() + + def init_weights(self): + init_bn(self.bn0) + init_layer(self.fc_audioset) + + def forward(self, input, mixup_lambda=None): + """Input: (batch_size, times_steps, freq_bins)""" + + interpolate_ratio = 32 + + x = self.spectrogram_extractor(input) # (batch_size, 1, time_steps, freq_bins) + x = self.logmel_extractor(x) # (batch_size, 1, time_steps, mel_bins) + frames_num = x.shape[2] + x = x.transpose(1, 3) + x = 
self.bn0(x) + x = x.transpose(1, 3) + + if self.training: + x = self.time_shift(x) + x = self.spec_augmenter(x) + + # Mixup on spectrogram + if self.training and mixup_lambda is not None: + x = do_mixup(x, mixup_lambda) + #print(x.shape) #torch.Size([10, 1, 1001, 64]) + x = self.pvt_transformer(x) + #print(x.shape) #torch.Size([10, 800, 128]) + x = torch.mean(x, dim=3) + + x = x.transpose(1, 2).contiguous() + framewise_output = torch.sigmoid(self.fc_audioset(x)) + clipwise_output = self.temp_pool(x, framewise_output).clamp(1e-7, 1.).squeeze(1) + #print(framewise_output.shape) #torch.Size([10, 100, 17]) + framewise_output = interpolate(framewise_output, interpolate_ratio) + framewise_output = framewise_output[:,:1000,:] + #framewise_output = pad_framewise_output(framewise_output, frames_num) + output_dict = {'framewise_output': framewise_output, + 'clipwise_output': clipwise_output} + + return output_dict + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + self.linear = linear + if self.linear: + self.relu = nn.ReLU() + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = self.fc1(x) + if self.linear: + x = self.relu(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1, linear=False): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
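Both `Mlp` above and the `Attention` block that begins here carry the token-grid size `(H, W)` through `forward`, because the PVTv2 layers keep switching between the flat `(B, N, C)` token view and the 2-D grid (the depthwise conv inside the FFN, and the stride-`sr_ratio` reduction of keys/values). A small, self-contained shape sketch, using the stage-1 geometry implied by `tdim=1001`, `fdim=64`, `stride=4`; the exact numbers are illustrative:

```python
import torch

B, C = 2, 64
H, W = 250, 16              # roughly 1001 / 4 and 64 / 4 after the first patch embedding
N = H * W                   # 4000 tokens per clip at stage 1

x = torch.randn(B, N, C)

# Mlp.forward(x, H, W): DWConv folds the tokens back onto the grid before the
# 3x3 depthwise convolution, which is why H and W must be threaded through.
x_grid = x.transpose(1, 2).reshape(B, C, H, W)
assert x_grid.shape == (B, C, H, W)

# Attention with sr_ratio=8: keys/values come from a stride-8 conv over that
# grid, so the stage-1 attention matrix is N x ~62 instead of N x N.
sr_ratio = 8
reduced_tokens = ((H - sr_ratio) // sr_ratio + 1) * ((W - sr_ratio) // sr_ratio + 1)
print(N, reduced_tokens)    # 4000 vs 62
```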
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.linear = linear + self.sr_ratio = sr_ratio + if not linear: + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + else: + self.pool = nn.AdaptiveAvgPool2d(7) + self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1) + self.norm = nn.LayerNorm(dim) + self.act = nn.GELU() + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + if not self.linear: + if self.sr_ratio > 1: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + else: + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + else: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + x_ = self.act(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + k, v = kv[0], kv[1] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Pooling(nn.Module): + """ + Implementation of pooling for PoolFormer + --pool_size: pooling size + """ + def __init__(self, pool_size=3): + super().__init__() + self.pool = nn.AvgPool2d( + pool_size, stride=1, padding=pool_size//2, count_include_pad=False) + + def forward(self, x): + return self.pool(x) - x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear) + #self.norm3 = norm_layer(dim) + #self.token_mixer = Pooling(pool_size=3) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +class OverlapPatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, tdim, fdim, patch_size=7, stride=4, in_chans=3, embed_dim=768): + super().__init__() + img_size = (tdim, fdim) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // stride, img_size[1] // stride + self.num_patches = self.H * self.W + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, + padding=(patch_size[0] // 3, patch_size[1] // 3)) + self.norm = nn.LayerNorm(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + + return x, H, W + + +class PyramidVisionTransformerV2(nn.Module): + def __init__(self, tdim=1001, fdim=64, patch_size=16, stride=4, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., + attn_drop_rate=0., drop_path_rate=0.1, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], num_stages=2, linear=False, pretrained=None): + super().__init__() + # self.num_classes = num_classes + self.depths = depths + self.num_stages = num_stages + self.linear = linear + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + cur = 0 + + for i in range(num_stages): + patch_embed = OverlapPatchEmbed(tdim=tdim if i == 0 else tdim // (2 ** (i + 1)), + fdim=fdim if i == 0 else tdim // (2 ** (i + 1)), + patch_size=7 if i == 0 else 3, + stride=stride if i == 0 else 2, + in_chans=in_chans if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i]) + block = nn.ModuleList([Block( + dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + j], norm_layer=norm_layer, + sr_ratio=sr_ratios[i], linear=linear) + for j in range(depths[i])]) + norm = norm_layer(embed_dims[i]) + cur += depths[i] + + setattr(self, 
f"patch_embed{i + 1}", patch_embed) + setattr(self, f"block{i + 1}", block) + setattr(self, f"norm{i + 1}", norm) + #self.n = nn.Linear(125, 250, bias=True) + # classification head + # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity() + self.apply(self._init_weights) + self.init_weights(pretrained) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger) + + def freeze_patch_emb(self): + self.patch_embed1.requires_grad = False + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + + for i in range(self.num_stages): + patch_embed = getattr(self, f"patch_embed{i + 1}") + block = getattr(self, f"block{i + 1}") + norm = getattr(self, f"norm{i + 1}") + x, H, W = patch_embed(x) + #print(x.shape) + for blk in block: + x = blk(x, H, W) + #print(x.shape) + x = norm(x) + #if i != self.num_stages - 1: + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + #print(x.shape) + return x + + def forward(self, x): + x = self.forward_features(x) + # x = self.head(x) + + return x + +class DWConv(nn.Module): + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = x.transpose(1, 2).view(B, C, H, W) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + + return x + + +def _conv_filter(state_dict, patch_size=16): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k: + v = v.reshape((v.shape[0], 3, patch_size, patch_size)) + out_dict[k] = v + + return out_dict diff --git a/audio_detection/audio_infer/pytorch/pytorch_utils.py b/audio_detection/audio_infer/pytorch/pytorch_utils.py new file mode 100644 index 0000000..a135b33 --- /dev/null +++ b/audio_detection/audio_infer/pytorch/pytorch_utils.py @@ -0,0 +1,251 @@ +import numpy as np +import time +import torch +import torch.nn as nn + + +def move_data_to_device(x, device): + if 'float' in str(x.dtype): + x = torch.Tensor(x) + elif 'int' in str(x.dtype): + x = torch.LongTensor(x) + else: + return x + + return x.to(device) + + +def do_mixup(x, mixup_lambda): + """Mixup x of even indexes (0, 2, 4, ...) with x of odd indexes + (1, 3, 5, ...). + + Args: + x: (batch_size * 2, ...) + mixup_lambda: (batch_size * 2,) + + Returns: + out: (batch_size, ...) 
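`do_mixup` assumes the batch is laid out as interleaved mixup pairs (clip 0 with clip 1, clip 2 with clip 3, and so on), with one weight per original clip, and returns a batch of half the size. A quick usage sketch; the import path is an assumption about how this package ends up on `sys.path`:

```python
import torch
from audio_infer.pytorch.pytorch_utils import do_mixup   # import path is an assumption

# Four clips arranged as two mixup pairs: (0, 1) and (2, 3).
x = torch.arange(4, dtype=torch.float32).reshape(4, 1).repeat(1, 5)  # toy (4, 5) "features"
mixup_lambda = torch.tensor([0.7, 0.3, 0.5, 0.5])                    # lambda and 1 - lambda per pair

out = do_mixup(x, mixup_lambda)
print(out.shape)   # (2, 5): half the batch size
print(out[0])      # 0.7 * clip 0 + 0.3 * clip 1 -> a tensor of 0.3 here
```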
+ """ + out = (x[0 :: 2].transpose(0, -1) * mixup_lambda[0 :: 2] + \ + x[1 :: 2].transpose(0, -1) * mixup_lambda[1 :: 2]).transpose(0, -1) + return out + + +def append_to_dict(dict, key, value): + if key in dict.keys(): + dict[key].append(value) + else: + dict[key] = [value] + + +def forward(model, generator, return_input=False, + return_target=False): + """Forward data to a model. + + Args: + model: object + generator: object + return_input: bool + return_target: bool + + Returns: + audio_name: (audios_num,) + clipwise_output: (audios_num, classes_num) + (ifexist) segmentwise_output: (audios_num, segments_num, classes_num) + (ifexist) framewise_output: (audios_num, frames_num, classes_num) + (optional) return_input: (audios_num, segment_samples) + (optional) return_target: (audios_num, classes_num) + """ + output_dict = {} + device = next(model.parameters()).device + time1 = time.time() + + # Forward data to a model in mini-batches + for n, batch_data_dict in enumerate(generator): + print(n) + batch_waveform = move_data_to_device(batch_data_dict['waveform'], device) + + with torch.no_grad(): + model.eval() + batch_output = model(batch_waveform) + + append_to_dict(output_dict, 'audio_name', batch_data_dict['audio_name']) + + append_to_dict(output_dict, 'clipwise_output', + batch_output['clipwise_output'].data.cpu().numpy()) + + if 'segmentwise_output' in batch_output.keys(): + append_to_dict(output_dict, 'segmentwise_output', + batch_output['segmentwise_output'].data.cpu().numpy()) + + if 'framewise_output' in batch_output.keys(): + append_to_dict(output_dict, 'framewise_output', + batch_output['framewise_output'].data.cpu().numpy()) + + if return_input: + append_to_dict(output_dict, 'waveform', batch_data_dict['waveform']) + + if return_target: + if 'target' in batch_data_dict.keys(): + append_to_dict(output_dict, 'target', batch_data_dict['target']) + + if n % 10 == 0: + print(' --- Inference time: {:.3f} s / 10 iterations ---'.format( + time.time() - time1)) + time1 = time.time() + + for key in output_dict.keys(): + output_dict[key] = np.concatenate(output_dict[key], axis=0) + + return output_dict + + +def interpolate(x, ratio): + """Interpolate data in time domain. This is used to compensate the + resolution reduction in downsampling of a CNN. + + Args: + x: (batch_size, time_steps, classes_num) + ratio: int, ratio to interpolate + + Returns: + upsampled: (batch_size, time_steps * ratio, classes_num) + """ + (batch_size, time_steps, classes_num) = x.shape + upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1) + upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num) + return upsampled + + +def pad_framewise_output(framewise_output, frames_num): + """Pad framewise_output to the same length as input frames. The pad value + is the same as the value of the last frame. + + Args: + framewise_output: (batch_size, frames_num, classes_num) + frames_num: int, number of frames to pad + + Outputs: + output: (batch_size, frames_num, classes_num) + """ + pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1) + """tensor for padding""" + + output = torch.cat((framewise_output, pad), dim=1) + """(batch_size, frames_num, classes_num)""" + + return output + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def count_flops(model, audio_length): + """Count flops. Code modified from others' implementation. 
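The `interpolate` / `pad_framewise_output` pair defined above is what maps the transformer's coarse time axis back onto STFT frames. A short usage sketch with toy sizes; the import path is an assumption:

```python
import torch
from audio_infer.pytorch.pytorch_utils import interpolate, pad_framewise_output  # path assumed

batch_size, coarse_steps, classes_num = 2, 31, 17   # toy sizes
frames_num = 1001                                    # 10 s at 32 kHz with hop 320

coarse = torch.rand(batch_size, coarse_steps, classes_num)

up = interpolate(coarse, ratio=32)                   # each step repeated 32x -> (2, 992, 17)

# 31 * 32 < 1001, so the tail is filled by repeating the last frame.
full = pad_framewise_output(up, frames_num)          # -> (2, 1001, 17)
assert full.shape == (batch_size, frames_num, classes_num)
```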
+ """ + multiply_adds = True + list_conv2d=[] + def conv2d_hook(self, input, output): + batch_size, input_channels, input_height, input_width = input[0].size() + output_channels, output_height, output_width = output[0].size() + + kernel_ops = self.kernel_size[0] * self.kernel_size[1] * (self.in_channels / self.groups) * (2 if multiply_adds else 1) + bias_ops = 1 if self.bias is not None else 0 + + params = output_channels * (kernel_ops + bias_ops) + flops = batch_size * params * output_height * output_width + + list_conv2d.append(flops) + + list_conv1d=[] + def conv1d_hook(self, input, output): + batch_size, input_channels, input_length = input[0].size() + output_channels, output_length = output[0].size() + + kernel_ops = self.kernel_size[0] * (self.in_channels / self.groups) * (2 if multiply_adds else 1) + bias_ops = 1 if self.bias is not None else 0 + + params = output_channels * (kernel_ops + bias_ops) + flops = batch_size * params * output_length + + list_conv1d.append(flops) + + list_linear=[] + def linear_hook(self, input, output): + batch_size = input[0].size(0) if input[0].dim() == 2 else 1 + + weight_ops = self.weight.nelement() * (2 if multiply_adds else 1) + bias_ops = self.bias.nelement() + + flops = batch_size * (weight_ops + bias_ops) + list_linear.append(flops) + + list_bn=[] + def bn_hook(self, input, output): + list_bn.append(input[0].nelement() * 2) + + list_relu=[] + def relu_hook(self, input, output): + list_relu.append(input[0].nelement() * 2) + + list_pooling2d=[] + def pooling2d_hook(self, input, output): + batch_size, input_channels, input_height, input_width = input[0].size() + output_channels, output_height, output_width = output[0].size() + + kernel_ops = self.kernel_size * self.kernel_size + bias_ops = 0 + params = output_channels * (kernel_ops + bias_ops) + flops = batch_size * params * output_height * output_width + + list_pooling2d.append(flops) + + list_pooling1d=[] + def pooling1d_hook(self, input, output): + batch_size, input_channels, input_length = input[0].size() + output_channels, output_length = output[0].size() + + kernel_ops = self.kernel_size[0] + bias_ops = 0 + + params = output_channels * (kernel_ops + bias_ops) + flops = batch_size * params * output_length + + list_pooling2d.append(flops) + + def foo(net): + childrens = list(net.children()) + if not childrens: + if isinstance(net, nn.Conv2d): + net.register_forward_hook(conv2d_hook) + elif isinstance(net, nn.Conv1d): + net.register_forward_hook(conv1d_hook) + elif isinstance(net, nn.Linear): + net.register_forward_hook(linear_hook) + elif isinstance(net, nn.BatchNorm2d) or isinstance(net, nn.BatchNorm1d): + net.register_forward_hook(bn_hook) + elif isinstance(net, nn.ReLU): + net.register_forward_hook(relu_hook) + elif isinstance(net, nn.AvgPool2d) or isinstance(net, nn.MaxPool2d): + net.register_forward_hook(pooling2d_hook) + elif isinstance(net, nn.AvgPool1d) or isinstance(net, nn.MaxPool1d): + net.register_forward_hook(pooling1d_hook) + else: + print('Warning: flop of module {} is not counted!'.format(net)) + return + for c in childrens: + foo(c) + + # Register hook + foo(model) + + device = device = next(model.parameters()).device + input = torch.rand(1, audio_length).to(device) + + out = model(input) + + total_flops = sum(list_conv2d) + sum(list_conv1d) + sum(list_linear) + \ + sum(list_bn) + sum(list_relu) + sum(list_pooling2d) + sum(list_pooling1d) + + return total_flops \ No newline at end of file diff --git a/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png 
b/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png new file mode 100644 index 0000000..3c2b5d8 Binary files /dev/null and b/audio_detection/audio_infer/results/YDlWd7Wmdi1E.png differ diff --git a/audio_detection/audio_infer/utils/config.py b/audio_detection/audio_infer/utils/config.py new file mode 100644 index 0000000..934be1c --- /dev/null +++ b/audio_detection/audio_infer/utils/config.py @@ -0,0 +1,94 @@ +import numpy as np +import csv + +sample_rate = 32000 +clip_samples = sample_rate * 10 # Audio clips are 10-second + +# Load label +with open('./audio_detection/audio_infer/metadata/class_labels_indices.csv', 'r') as f: + reader = csv.reader(f, delimiter=',') + lines = list(reader) + +labels = [] +ids = [] # Each label has a unique id such as "/m/068hy" +for i1 in range(1, len(lines)): + id = lines[i1][1] + label = lines[i1][2] + ids.append(id) + labels.append(label) + +classes_num = len(labels) + +lb_to_ix = {label : i for i, label in enumerate(labels)} +ix_to_lb = {i : label for i, label in enumerate(labels)} + +id_to_ix = {id : i for i, id in enumerate(ids)} +ix_to_id = {i : id for i, id in enumerate(ids)} + +full_samples_per_class = np.array([ + 937432, 16344, 7822, 10271, 2043, 14420, 733, 1511, + 1258, 424, 1751, 704, 369, 590, 1063, 1375, + 5026, 743, 853, 1648, 714, 1497, 1251, 2139, + 1093, 133, 224, 39469, 6423, 407, 1559, 4546, + 6826, 7464, 2468, 549, 4063, 334, 587, 238, + 1766, 691, 114, 2153, 236, 209, 421, 740, + 269, 959, 137, 4192, 485, 1515, 655, 274, + 69, 157, 1128, 807, 1022, 346, 98, 680, + 890, 352, 4169, 2061, 1753, 9883, 1339, 708, + 37857, 18504, 12864, 2475, 2182, 757, 3624, 677, + 1683, 3583, 444, 1780, 2364, 409, 4060, 3097, + 3143, 502, 723, 600, 230, 852, 1498, 1865, + 1879, 2429, 5498, 5430, 2139, 1761, 1051, 831, + 2401, 2258, 1672, 1711, 987, 646, 794, 25061, + 5792, 4256, 96, 8126, 2740, 752, 513, 554, + 106, 254, 1592, 556, 331, 615, 2841, 737, + 265, 1349, 358, 1731, 1115, 295, 1070, 972, + 174, 937780, 112337, 42509, 49200, 11415, 6092, 13851, + 2665, 1678, 13344, 2329, 1415, 2244, 1099, 5024, + 9872, 10948, 4409, 2732, 1211, 1289, 4807, 5136, + 1867, 16134, 14519, 3086, 19261, 6499, 4273, 2790, + 8820, 1228, 1575, 4420, 3685, 2019, 664, 324, + 513, 411, 436, 2997, 5162, 3806, 1389, 899, + 8088, 7004, 1105, 3633, 2621, 9753, 1082, 26854, + 3415, 4991, 2129, 5546, 4489, 2850, 1977, 1908, + 1719, 1106, 1049, 152, 136, 802, 488, 592, + 2081, 2712, 1665, 1128, 250, 544, 789, 2715, + 8063, 7056, 2267, 8034, 6092, 3815, 1833, 3277, + 8813, 2111, 4662, 2678, 2954, 5227, 1472, 2591, + 3714, 1974, 1795, 4680, 3751, 6585, 2109, 36617, + 6083, 16264, 17351, 3449, 5034, 3931, 2599, 4134, + 3892, 2334, 2211, 4516, 2766, 2862, 3422, 1788, + 2544, 2403, 2892, 4042, 3460, 1516, 1972, 1563, + 1579, 2776, 1647, 4535, 3921, 1261, 6074, 2922, + 3068, 1948, 4407, 712, 1294, 1019, 1572, 3764, + 5218, 975, 1539, 6376, 1606, 6091, 1138, 1169, + 7925, 3136, 1108, 2677, 2680, 1383, 3144, 2653, + 1986, 1800, 1308, 1344, 122231, 12977, 2552, 2678, + 7824, 768, 8587, 39503, 3474, 661, 430, 193, + 1405, 1442, 3588, 6280, 10515, 785, 710, 305, + 206, 4990, 5329, 3398, 1771, 3022, 6907, 1523, + 8588, 12203, 666, 2113, 7916, 434, 1636, 5185, + 1062, 664, 952, 3490, 2811, 2749, 2848, 15555, + 363, 117, 1494, 1647, 5886, 4021, 633, 1013, + 5951, 11343, 2324, 243, 372, 943, 734, 242, + 3161, 122, 127, 201, 1654, 768, 134, 1467, + 642, 1148, 2156, 1368, 1176, 302, 1909, 61, + 223, 1812, 287, 422, 311, 228, 748, 230, + 1876, 539, 1814, 737, 689, 1140, 591, 943, + 353, 289, 198, 
490, 7938, 1841, 850, 457, + 814, 146, 551, 728, 1627, 620, 648, 1621, + 2731, 535, 88, 1736, 736, 328, 293, 3170, + 344, 384, 7640, 433, 215, 715, 626, 128, + 3059, 1833, 2069, 3732, 1640, 1508, 836, 567, + 2837, 1151, 2068, 695, 1494, 3173, 364, 88, + 188, 740, 677, 273, 1533, 821, 1091, 293, + 647, 318, 1202, 328, 532, 2847, 526, 721, + 370, 258, 956, 1269, 1641, 339, 1322, 4485, + 286, 1874, 277, 757, 1393, 1330, 380, 146, + 377, 394, 318, 339, 1477, 1886, 101, 1435, + 284, 1425, 686, 621, 221, 117, 87, 1340, + 201, 1243, 1222, 651, 1899, 421, 712, 1016, + 1279, 124, 351, 258, 7043, 368, 666, 162, + 7664, 137, 70159, 26179, 6321, 32236, 33320, 771, + 1169, 269, 1103, 444, 364, 2710, 121, 751, + 1609, 855, 1141, 2287, 1940, 3943, 289]) diff --git a/audio_detection/audio_infer/utils/crash.py b/audio_detection/audio_infer/utils/crash.py new file mode 100644 index 0000000..98a06e2 --- /dev/null +++ b/audio_detection/audio_infer/utils/crash.py @@ -0,0 +1,12 @@ +import sys + +class ExceptionHook: + instance = None + def __call__(self, *args, **kwargs): + if self.instance is None: + from IPython.core import ultratb + self.instance = ultratb.FormattedTB(mode='Plain', + color_scheme='Linux', call_pdb=1) + return self.instance(*args, **kwargs) + +sys.excepthook = ExceptionHook() diff --git a/audio_detection/audio_infer/utils/create_black_list.py b/audio_detection/audio_infer/utils/create_black_list.py new file mode 100644 index 0000000..fadbe94 --- /dev/null +++ b/audio_detection/audio_infer/utils/create_black_list.py @@ -0,0 +1,64 @@ +import argparse +import csv +import os + +from utilities import create_folder + + +def dcase2017task4(args): + """Create black list. Black list is a list of audio ids that will be + skipped in training. + """ + + # Augments & parameters + workspace = args.workspace + + # Black list from DCASE 2017 Task 4 + test_weak_csv = 'metadata/black_list/groundtruth_weak_label_testing_set.csv' + evaluation_weak_csv = 'metadata/black_list/groundtruth_weak_label_evaluation_set.csv' + + black_list_csv = os.path.join(workspace, 'black_list', 'dcase2017task4.csv') + create_folder(os.path.dirname(black_list_csv)) + + def get_id_sets(csv_path): + with open(csv_path, 'r') as fr: + reader = csv.reader(fr, delimiter='\t') + lines = list(reader) + + ids_set = [] + + for line in lines: + """line: ['-5QrBL6MzLg_60.000_70.000.wav', '60.000', '70.000', 'Train horn']""" + ids_set.append(line[0][0 : 11]) + + ids_set = list(set(ids_set)) + return ids_set + + test_ids_set = get_id_sets(test_weak_csv) + evaluation_ids_set = get_id_sets(evaluation_weak_csv) + + full_ids_set = test_ids_set + evaluation_ids_set + + # Write black list + fw = open(black_list_csv, 'w') + + for id in full_ids_set: + fw.write('{}\n'.format(id)) + + print('Write black list to {}'.format(black_list_csv)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='') + subparsers = parser.add_subparsers(dest='mode') + + parser_dcase2017task4 = subparsers.add_parser('dcase2017task4') + parser_dcase2017task4.add_argument('--workspace', type=str, required=True) + + args = parser.parse_args() + + if args.mode == 'dcase2017task4': + dcase2017task4(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/create_indexes.py b/audio_detection/audio_infer/utils/create_indexes.py new file mode 100644 index 0000000..78be38c --- /dev/null +++ b/audio_detection/audio_infer/utils/create_indexes.py @@ -0,0 +1,126 @@ +import numpy as np +import 
argparse +import csv +import os +import glob +import datetime +import time +import logging +import h5py +import librosa + +from utilities import create_folder, get_sub_filepaths +import config + + +def create_indexes(args): + """Create indexes a for dataloader to read for training. When users have + a new task and their own data, they need to create similar indexes. The + indexes contain meta information of "where to find the data for training". + """ + + # Arguments & parameters + waveforms_hdf5_path = args.waveforms_hdf5_path + indexes_hdf5_path = args.indexes_hdf5_path + + # Paths + create_folder(os.path.dirname(indexes_hdf5_path)) + + with h5py.File(waveforms_hdf5_path, 'r') as hr: + with h5py.File(indexes_hdf5_path, 'w') as hw: + audios_num = len(hr['audio_name']) + hw.create_dataset('audio_name', data=hr['audio_name'][:], dtype='S20') + hw.create_dataset('target', data=hr['target'][:], dtype=np.bool) + hw.create_dataset('hdf5_path', data=[waveforms_hdf5_path.encode()] * audios_num, dtype='S200') + hw.create_dataset('index_in_hdf5', data=np.arange(audios_num), dtype=np.int32) + + print('Write to {}'.format(indexes_hdf5_path)) + + +def combine_full_indexes(args): + """Combine all balanced and unbalanced indexes hdf5s to a single hdf5. This + combined indexes hdf5 is used for training with full data (~20k balanced + audio clips + ~1.9m unbalanced audio clips). + """ + + # Arguments & parameters + indexes_hdf5s_dir = args.indexes_hdf5s_dir + full_indexes_hdf5_path = args.full_indexes_hdf5_path + + classes_num = config.classes_num + + # Paths + paths = get_sub_filepaths(indexes_hdf5s_dir) + paths = [path for path in paths if ( + 'train' in path and 'full_train' not in path and 'mini' not in path)] + + print('Total {} hdf5 to combine.'.format(len(paths))) + + with h5py.File(full_indexes_hdf5_path, 'w') as full_hf: + full_hf.create_dataset( + name='audio_name', + shape=(0,), + maxshape=(None,), + dtype='S20') + + full_hf.create_dataset( + name='target', + shape=(0, classes_num), + maxshape=(None, classes_num), + dtype=np.bool) + + full_hf.create_dataset( + name='hdf5_path', + shape=(0,), + maxshape=(None,), + dtype='S200') + + full_hf.create_dataset( + name='index_in_hdf5', + shape=(0,), + maxshape=(None,), + dtype=np.int32) + + for path in paths: + with h5py.File(path, 'r') as part_hf: + print(path) + n = len(full_hf['audio_name'][:]) + new_n = n + len(part_hf['audio_name'][:]) + + full_hf['audio_name'].resize((new_n,)) + full_hf['audio_name'][n : new_n] = part_hf['audio_name'][:] + + full_hf['target'].resize((new_n, classes_num)) + full_hf['target'][n : new_n] = part_hf['target'][:] + + full_hf['hdf5_path'].resize((new_n,)) + full_hf['hdf5_path'][n : new_n] = part_hf['hdf5_path'][:] + + full_hf['index_in_hdf5'].resize((new_n,)) + full_hf['index_in_hdf5'][n : new_n] = part_hf['index_in_hdf5'][:] + + print('Write combined full hdf5 to {}'.format(full_indexes_hdf5_path)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers(dest='mode') + + parser_create_indexes = subparsers.add_parser('create_indexes') + parser_create_indexes.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path of packed waveforms hdf5.') + parser_create_indexes.add_argument('--indexes_hdf5_path', type=str, required=True, help='Path to write out indexes hdf5.') + + parser_combine_full_indexes = subparsers.add_parser('combine_full_indexes') + parser_combine_full_indexes.add_argument('--indexes_hdf5s_dir', type=str, required=True, help='Directory 
containing indexes hdf5s to be combined.') + parser_combine_full_indexes.add_argument('--full_indexes_hdf5_path', type=str, required=True, help='Path to write out full indexes hdf5 file.') + + args = parser.parse_args() + + if args.mode == 'create_indexes': + create_indexes(args) + + elif args.mode == 'combine_full_indexes': + combine_full_indexes(args) + + else: + raise Exception('Incorrect arguments!') \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/data_generator.py b/audio_detection/audio_infer/utils/data_generator.py new file mode 100644 index 0000000..b94b6d9 --- /dev/null +++ b/audio_detection/audio_infer/utils/data_generator.py @@ -0,0 +1,421 @@ +import numpy as np +import h5py +import csv +import time +import logging + +from utilities import int16_to_float32 + + +def read_black_list(black_list_csv): + """Read audio names from black list. + """ + with open(black_list_csv, 'r') as fr: + reader = csv.reader(fr) + lines = list(reader) + + black_list_names = ['Y{}.wav'.format(line[0]) for line in lines] + return black_list_names + + +class AudioSetDataset(object): + def __init__(self, sample_rate=32000): + """This class takes the meta of an audio clip as input, and return + the waveform and target of the audio clip. This class is used by DataLoader. + """ + self.sample_rate = sample_rate + + def __getitem__(self, meta): + """Load waveform and target of an audio clip. + + Args: + meta: { + 'hdf5_path': str, + 'index_in_hdf5': int} + + Returns: + data_dict: { + 'audio_name': str, + 'waveform': (clip_samples,), + 'target': (classes_num,)} + """ + hdf5_path = meta['hdf5_path'] + index_in_hdf5 = meta['index_in_hdf5'] + with h5py.File(hdf5_path, 'r') as hf: + audio_name = hf['audio_name'][index_in_hdf5].decode() + waveform = int16_to_float32(hf['waveform'][index_in_hdf5]) + waveform = self.resample(waveform) + target = hf['target'][index_in_hdf5].astype(np.float32) + + data_dict = { + 'audio_name': audio_name, 'waveform': waveform, 'target': target} + + return data_dict + + def resample(self, waveform): + """Resample. + + Args: + waveform: (clip_samples,) + + Returns: + (resampled_clip_samples,) + """ + if self.sample_rate == 32000: + return waveform + elif self.sample_rate == 16000: + return waveform[0 :: 2] + elif self.sample_rate == 8000: + return waveform[0 :: 4] + else: + raise Exception('Incorrect sample rate!') + + +class Base(object): + def __init__(self, indexes_hdf5_path, batch_size, black_list_csv, random_seed): + """Base class of train sampler. 
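`AudioSetDataset` above is keyed by a meta dict rather than by an integer index, so it can fetch a clip from whichever packed HDF5 file the sampler points at. A minimal sketch; the import and the HDF5 path are placeholders for however the package and workspace are laid out:

```python
from data_generator import AudioSetDataset   # adjust to the actual package layout

dataset = AudioSetDataset(sample_rate=32000)

# The samplers below yield lists of dicts shaped exactly like this one;
# index_in_hdf5 addresses a clip inside the packed file, not a global index.
meta = {'hdf5_path': 'workspaces/hdf5s/waveforms/balanced_train.h5',   # placeholder
        'index_in_hdf5': 0}

data = dataset[meta]
print(data['audio_name'])       # 'Y....wav' (clips are prefixed with 'Y' when packed)
print(data['waveform'].shape)   # (320000,) at 32 kHz; halved at 16 kHz, quartered at 8 kHz
print(data['target'].shape)     # (classes_num,) multi-hot float32 vector
```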
+ + Args: + indexes_hdf5_path: string + batch_size: int + black_list_csv: string + random_seed: int + """ + self.batch_size = batch_size + self.random_state = np.random.RandomState(random_seed) + + # Black list + if black_list_csv: + self.black_list_names = read_black_list(black_list_csv) + else: + self.black_list_names = [] + + logging.info('Black list samples: {}'.format(len(self.black_list_names))) + + # Load target + load_time = time.time() + + with h5py.File(indexes_hdf5_path, 'r') as hf: + self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]] + self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]] + self.indexes_in_hdf5 = hf['index_in_hdf5'][:] + self.targets = hf['target'][:].astype(np.float32) + + (self.audios_num, self.classes_num) = self.targets.shape + logging.info('Training number: {}'.format(self.audios_num)) + logging.info('Load target time: {:.3f} s'.format(time.time() - load_time)) + + +class TrainSampler(Base): + def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None, + random_seed=1234): + """Balanced sampler. Generate batch meta for training. + + Args: + indexes_hdf5_path: string + batch_size: int + black_list_csv: string + random_seed: int + """ + super(TrainSampler, self).__init__(indexes_hdf5_path, batch_size, + black_list_csv, random_seed) + + self.indexes = np.arange(self.audios_num) + + # Shuffle indexes + self.random_state.shuffle(self.indexes) + + self.pointer = 0 + + def __iter__(self): + """Generate batch meta for training. + + Returns: + batch_meta: e.g.: [ + {'hdf5_path': string, 'index_in_hdf5': int}, + ...] + """ + batch_size = self.batch_size + + while True: + batch_meta = [] + i = 0 + while i < batch_size: + index = self.indexes[self.pointer] + self.pointer += 1 + + # Shuffle indexes and reset pointer + if self.pointer >= self.audios_num: + self.pointer = 0 + self.random_state.shuffle(self.indexes) + + # If audio in black list then continue + if self.audio_names[index] in self.black_list_names: + continue + else: + batch_meta.append({ + 'hdf5_path': self.hdf5_paths[index], + 'index_in_hdf5': self.indexes_in_hdf5[index]}) + i += 1 + + yield batch_meta + + def state_dict(self): + state = { + 'indexes': self.indexes, + 'pointer': self.pointer} + return state + + def load_state_dict(self, state): + self.indexes = state['indexes'] + self.pointer = state['pointer'] + + +class BalancedTrainSampler(Base): + def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None, + random_seed=1234): + """Balanced sampler. Generate batch meta for training. Data are equally + sampled from different sound classes. + + Args: + indexes_hdf5_path: string + batch_size: int + black_list_csv: string + random_seed: int + """ + super(BalancedTrainSampler, self).__init__(indexes_hdf5_path, + batch_size, black_list_csv, random_seed) + + self.samples_num_per_class = np.sum(self.targets, axis=0) + logging.info('samples_num_per_class: {}'.format( + self.samples_num_per_class.astype(np.int32))) + + # Training indexes of all sound classes. E.g.: + # [[0, 11, 12, ...], [3, 4, 15, 16, ...], [7, 8, ...], ...] 
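The samplers' `__iter__` methods never terminate, so long training runs persist the sampler position next to the model checkpoint; that is what the `state_dict` / `load_state_dict` pair on `TrainSampler` above is for. A hedged sketch with placeholder paths:

```python
import torch
from data_generator import TrainSampler   # adjust to the actual package layout

indexes_path = 'workspaces/hdf5s/indexes/balanced_train.h5'   # placeholder
sampler = TrainSampler(indexes_hdf5_path=indexes_path, batch_size=32)

batch_meta = next(iter(sampler))   # 32 dicts of {'hdf5_path', 'index_in_hdf5'}

# Save the shuffled order and read pointer so a resumed run continues where
# this one stopped instead of restarting the epoch.
torch.save({'sampler': sampler.state_dict()}, 'checkpoint.pth')   # placeholder path

resumed = TrainSampler(indexes_hdf5_path=indexes_path, batch_size=32)
resumed.load_state_dict(torch.load('checkpoint.pth')['sampler'])
```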
+ self.indexes_per_class = [] + + for k in range(self.classes_num): + self.indexes_per_class.append( + np.where(self.targets[:, k] == 1)[0]) + + # Shuffle indexes + for k in range(self.classes_num): + self.random_state.shuffle(self.indexes_per_class[k]) + + self.queue = [] + self.pointers_of_classes = [0] * self.classes_num + + def expand_queue(self, queue): + classes_set = np.arange(self.classes_num).tolist() + self.random_state.shuffle(classes_set) + queue += classes_set + return queue + + def __iter__(self): + """Generate batch meta for training. + + Returns: + batch_meta: e.g.: [ + {'hdf5_path': string, 'index_in_hdf5': int}, + ...] + """ + batch_size = self.batch_size + + while True: + batch_meta = [] + i = 0 + while i < batch_size: + if len(self.queue) == 0: + self.queue = self.expand_queue(self.queue) + + class_id = self.queue.pop(0) + pointer = self.pointers_of_classes[class_id] + self.pointers_of_classes[class_id] += 1 + index = self.indexes_per_class[class_id][pointer] + + # When finish one epoch of a sound class, then shuffle its indexes and reset pointer + if self.pointers_of_classes[class_id] >= self.samples_num_per_class[class_id]: + self.pointers_of_classes[class_id] = 0 + self.random_state.shuffle(self.indexes_per_class[class_id]) + + # If audio in black list then continue + if self.audio_names[index] in self.black_list_names: + continue + else: + batch_meta.append({ + 'hdf5_path': self.hdf5_paths[index], + 'index_in_hdf5': self.indexes_in_hdf5[index]}) + i += 1 + + yield batch_meta + + def state_dict(self): + state = { + 'indexes_per_class': self.indexes_per_class, + 'queue': self.queue, + 'pointers_of_classes': self.pointers_of_classes} + return state + + def load_state_dict(self, state): + self.indexes_per_class = state['indexes_per_class'] + self.queue = state['queue'] + self.pointers_of_classes = state['pointers_of_classes'] + + +class AlternateTrainSampler(Base): + def __init__(self, indexes_hdf5_path, batch_size, black_list_csv=None, + random_seed=1234): + """AlternateSampler is a combination of Sampler and Balanced Sampler. + AlternateSampler alternately sample data from Sampler and Blanced Sampler. + + Args: + indexes_hdf5_path: string + batch_size: int + black_list_csv: string + random_seed: int + """ + self.sampler1 = TrainSampler(indexes_hdf5_path, batch_size, + black_list_csv, random_seed) + + self.sampler2 = BalancedTrainSampler(indexes_hdf5_path, batch_size, + black_list_csv, random_seed) + + self.batch_size = batch_size + self.count = 0 + + def __iter__(self): + """Generate batch meta for training. + + Returns: + batch_meta: e.g.: [ + {'hdf5_path': string, 'index_in_hdf5': int}, + ...] 
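To see why the per-class index lists built here matter, the following is a tiny self-contained illustration with a toy multi-hot target matrix: each class keeps its own shuffled pool of clip indexes and its own pointer, and a shuffled class queue decides which pool the next clip is drawn from, so rare classes are sampled as often as common ones.

```python
import numpy as np

rng = np.random.RandomState(1234)

# Toy multi-hot targets for 6 clips and 3 classes (the real matrix has shape
# (audios_num, classes_num) and comes from the indexes HDF5).
targets = np.array([[1, 0, 0],
                    [1, 0, 0],
                    [0, 1, 0],
                    [1, 0, 1],
                    [0, 0, 1],
                    [0, 1, 0]], dtype=np.float32)

# Same construction as BalancedTrainSampler: one index pool per class.
indexes_per_class = [np.where(targets[:, k] == 1)[0] for k in range(targets.shape[1])]
print(indexes_per_class)            # [array([0, 1, 3]), array([2, 5]), array([3, 4])]

# The class queue is a shuffled list of class ids, refilled whenever it runs
# out, mirroring expand_queue() above.
queue = list(rng.permutation(targets.shape[1]))
print(queue)
```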
+ """ + batch_size = self.batch_size + + while True: + self.count += 1 + + if self.count % 2 == 0: + batch_meta = [] + i = 0 + while i < batch_size: + index = self.sampler1.indexes[self.sampler1.pointer] + self.sampler1.pointer += 1 + + # Shuffle indexes and reset pointer + if self.sampler1.pointer >= self.sampler1.audios_num: + self.sampler1.pointer = 0 + self.sampler1.random_state.shuffle(self.sampler1.indexes) + + # If audio in black list then continue + if self.sampler1.audio_names[index] in self.sampler1.black_list_names: + continue + else: + batch_meta.append({ + 'hdf5_path': self.sampler1.hdf5_paths[index], + 'index_in_hdf5': self.sampler1.indexes_in_hdf5[index]}) + i += 1 + + elif self.count % 2 == 1: + batch_meta = [] + i = 0 + while i < batch_size: + if len(self.sampler2.queue) == 0: + self.sampler2.queue = self.sampler2.expand_queue(self.sampler2.queue) + + class_id = self.sampler2.queue.pop(0) + pointer = self.sampler2.pointers_of_classes[class_id] + self.sampler2.pointers_of_classes[class_id] += 1 + index = self.sampler2.indexes_per_class[class_id][pointer] + + # When finish one epoch of a sound class, then shuffle its indexes and reset pointer + if self.sampler2.pointers_of_classes[class_id] >= self.sampler2.samples_num_per_class[class_id]: + self.sampler2.pointers_of_classes[class_id] = 0 + self.sampler2.random_state.shuffle(self.sampler2.indexes_per_class[class_id]) + + # If audio in black list then continue + if self.sampler2.audio_names[index] in self.sampler2.black_list_names: + continue + else: + batch_meta.append({ + 'hdf5_path': self.sampler2.hdf5_paths[index], + 'index_in_hdf5': self.sampler2.indexes_in_hdf5[index]}) + i += 1 + + yield batch_meta + + def state_dict(self): + state = { + 'sampler1': self.sampler1.state_dict(), + 'sampler2': self.sampler2.state_dict()} + return state + + def load_state_dict(self, state): + self.sampler1.load_state_dict(state['sampler1']) + self.sampler2.load_state_dict(state['sampler2']) + + +class EvaluateSampler(object): + def __init__(self, indexes_hdf5_path, batch_size): + """Evaluate sampler. Generate batch meta for evaluation. + + Args: + indexes_hdf5_path: string + batch_size: int + """ + self.batch_size = batch_size + + with h5py.File(indexes_hdf5_path, 'r') as hf: + self.audio_names = [audio_name.decode() for audio_name in hf['audio_name'][:]] + self.hdf5_paths = [hdf5_path.decode() for hdf5_path in hf['hdf5_path'][:]] + self.indexes_in_hdf5 = hf['index_in_hdf5'][:] + self.targets = hf['target'][:].astype(np.float32) + + self.audios_num = len(self.audio_names) + + def __iter__(self): + """Generate batch meta for training. + + Returns: + batch_meta: e.g.: [ + {'hdf5_path': string, + 'index_in_hdf5': int} + ...] + """ + batch_size = self.batch_size + pointer = 0 + + while pointer < self.audios_num: + batch_indexes = np.arange(pointer, + min(pointer + batch_size, self.audios_num)) + + batch_meta = [] + + for index in batch_indexes: + batch_meta.append({ + 'audio_name': self.audio_names[index], + 'hdf5_path': self.hdf5_paths[index], + 'index_in_hdf5': self.indexes_in_hdf5[index], + 'target': self.targets[index]}) + + pointer += batch_size + yield batch_meta + + +def collate_fn(list_data_dict): + """Collate data. + Args: + list_data_dict, e.g., [{'audio_name': str, 'waveform': (clip_samples,), ...}, + {'audio_name': str, 'waveform': (clip_samples,), ...}, + ...] 
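All of these pieces meet in a standard PyTorch `DataLoader`: the sampler yields lists of meta dicts, `AudioSetDataset` resolves each one into a clip, and the `collate_fn` being defined here stacks the per-clip dicts into batched arrays. A hedged sketch of that wiring; paths and the import layout are placeholders, and since the samplers iterate forever the loop breaks out manually:

```python
import torch
from data_generator import AudioSetDataset, BalancedTrainSampler, collate_fn

dataset = AudioSetDataset(sample_rate=32000)
sampler = BalancedTrainSampler(
    indexes_hdf5_path='workspaces/hdf5s/indexes/balanced_train.h5',   # placeholder
    batch_size=32)

loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_sampler=sampler,     # every "index" handed to the dataset is a meta dict
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=True)

for batch in loader:
    waveform = torch.from_numpy(batch['waveform'])   # (32, clip_samples)
    target = torch.from_numpy(batch['target'])       # (32, classes_num)
    break                                            # the sampler never stops on its own
```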
+ Returns: + np_data_dict, dict, e.g., + {'audio_name': (batch_size,), 'waveform': (batch_size, clip_samples), ...} + """ + np_data_dict = {} + + for key in list_data_dict[0].keys(): + np_data_dict[key] = np.array([data_dict[key] for data_dict in list_data_dict]) + + return np_data_dict \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/dataset.py b/audio_detection/audio_infer/utils/dataset.py new file mode 100644 index 0000000..c7f1175 --- /dev/null +++ b/audio_detection/audio_infer/utils/dataset.py @@ -0,0 +1,224 @@ +import numpy as np +import argparse +import csv +import os +import glob +import datetime +import time +import logging +import h5py +import librosa + +from utilities import (create_folder, get_filename, create_logging, + float32_to_int16, pad_or_truncate, read_metadata) +import config + + +def split_unbalanced_csv_to_partial_csvs(args): + """Split unbalanced csv to part csvs. Each part csv contains up to 50000 ids. + """ + + unbalanced_csv_path = args.unbalanced_csv + unbalanced_partial_csvs_dir = args.unbalanced_partial_csvs_dir + + create_folder(unbalanced_partial_csvs_dir) + + with open(unbalanced_csv_path, 'r') as f: + lines = f.readlines() + + lines = lines[3:] # Remove head info + audios_num_per_file = 50000 + + files_num = int(np.ceil(len(lines) / float(audios_num_per_file))) + + for r in range(files_num): + lines_per_file = lines[r * audios_num_per_file : + (r + 1) * audios_num_per_file] + + out_csv_path = os.path.join(unbalanced_partial_csvs_dir, + 'unbalanced_train_segments_part{:02d}.csv'.format(r)) + + with open(out_csv_path, 'w') as f: + f.write('empty\n') + f.write('empty\n') + f.write('empty\n') + for line in lines_per_file: + f.write(line) + + print('Write out csv to {}'.format(out_csv_path)) + + +def download_wavs(args): + """Download videos and extract audio in wav format. 
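The download loop below pulls each row apart with a plain `split(', ')` and converts the clip boundaries into the `-ss` / `-t` arguments handed to ffmpeg. For reference, one illustrative AudioSet segments row (the id and label ids here are made up for the example) parses like this:

```python
import datetime

# Illustrative data row (the three csv header lines have already been stripped).
line = '--PJHxphWEs, 30.000, 40.000, "/m/09x0r,/t/dd00088"\n'

items = line.split(', ')
audio_id = items[0]
start_time = float(items[1])
end_time = float(items[2])
duration = end_time - start_time

# The same values the loop passes to youtube-dl (audio_id) and ffmpeg (-ss / -t).
print(audio_id, str(datetime.timedelta(seconds=start_time)), duration)
# --PJHxphWEs 0:00:30 10.0
```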
+ """ + + # Paths + csv_path = args.csv_path + audios_dir = args.audios_dir + mini_data = args.mini_data + + if mini_data: + logs_dir = '_logs/download_dataset/{}'.format(get_filename(csv_path)) + else: + logs_dir = '_logs/download_dataset_minidata/{}'.format(get_filename(csv_path)) + + create_folder(audios_dir) + create_folder(logs_dir) + create_logging(logs_dir, filemode='w') + logging.info('Download log is saved to {}'.format(logs_dir)) + + # Read csv + with open(csv_path, 'r') as f: + lines = f.readlines() + + lines = lines[3:] # Remove csv head info + + if mini_data: + lines = lines[0 : 10] # Download partial data for debug + + download_time = time.time() + + # Download + for (n, line) in enumerate(lines): + + items = line.split(', ') + audio_id = items[0] + start_time = float(items[1]) + end_time = float(items[2]) + duration = end_time - start_time + + logging.info('{} {} start_time: {:.1f}, end_time: {:.1f}'.format( + n, audio_id, start_time, end_time)) + + # Download full video of whatever format + video_name = os.path.join(audios_dir, '_Y{}.%(ext)s'.format(audio_id)) + os.system("youtube-dl --quiet -o '{}' -x https://www.youtube.com/watch?v={}"\ + .format(video_name, audio_id)) + + video_paths = glob.glob(os.path.join(audios_dir, '_Y' + audio_id + '.*')) + + # If download successful + if len(video_paths) > 0: + video_path = video_paths[0] # Choose one video + + # Add 'Y' to the head because some video ids are started with '-' + # which will cause problem + audio_path = os.path.join(audios_dir, 'Y' + audio_id + '.wav') + + # Extract audio in wav format + os.system("ffmpeg -loglevel panic -i {} -ac 1 -ar 32000 -ss {} -t 00:00:{} {} "\ + .format(video_path, + str(datetime.timedelta(seconds=start_time)), duration, + audio_path)) + + # Remove downloaded video + os.system("rm {}".format(video_path)) + + logging.info("Download and convert to {}".format(audio_path)) + + logging.info('Download finished! Time spent: {:.3f} s'.format( + time.time() - download_time)) + + logging.info('Logs can be viewed in {}'.format(logs_dir)) + + +def pack_waveforms_to_hdf5(args): + """Pack waveform and target of several audio clips to a single hdf5 file. + This can speed up loading and training. 
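The file written by the packing routine below holds int16 waveforms, boolean multi-hot targets and the sample rate, so reading a clip back is a single HDF5 slice plus a rescale. A hedged sketch: the path is a placeholder, and `utilities.int16_to_float32` is assumed to do the same divide-by-32767 rescaling shown inline here.

```python
import h5py
import numpy as np

with h5py.File('workspaces/hdf5s/waveforms/balanced_train.h5', 'r') as hf:   # placeholder path
    print(hf.attrs['sample_rate'])              # 32000
    audio_name = hf['audio_name'][0].decode()   # 'Y....wav'
    waveform_int16 = hf['waveform'][0]          # (clip_samples,) int16
    target = hf['target'][0]                    # (classes_num,) bool

# Undo float32_to_int16: back into [-1, 1] for librosa / torch consumption.
waveform = (waveform_int16 / 32767.0).astype(np.float32)
print(audio_name, waveform.shape, int(target.sum()), 'positive classes')
```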
+ """ + + # Arguments & parameters + audios_dir = args.audios_dir + csv_path = args.csv_path + waveforms_hdf5_path = args.waveforms_hdf5_path + mini_data = args.mini_data + + clip_samples = config.clip_samples + classes_num = config.classes_num + sample_rate = config.sample_rate + id_to_ix = config.id_to_ix + + # Paths + if mini_data: + prefix = 'mini_' + waveforms_hdf5_path += '.mini' + else: + prefix = '' + + create_folder(os.path.dirname(waveforms_hdf5_path)) + + logs_dir = '_logs/pack_waveforms_to_hdf5/{}{}'.format(prefix, get_filename(csv_path)) + create_folder(logs_dir) + create_logging(logs_dir, filemode='w') + logging.info('Write logs to {}'.format(logs_dir)) + + # Read csv file + meta_dict = read_metadata(csv_path, classes_num, id_to_ix) + + if mini_data: + mini_num = 10 + for key in meta_dict.keys(): + meta_dict[key] = meta_dict[key][0 : mini_num] + + audios_num = len(meta_dict['audio_name']) + + # Pack waveform to hdf5 + total_time = time.time() + + with h5py.File(waveforms_hdf5_path, 'w') as hf: + hf.create_dataset('audio_name', shape=((audios_num,)), dtype='S20') + hf.create_dataset('waveform', shape=((audios_num, clip_samples)), dtype=np.int16) + hf.create_dataset('target', shape=((audios_num, classes_num)), dtype=np.bool) + hf.attrs.create('sample_rate', data=sample_rate, dtype=np.int32) + + # Pack waveform & target of several audio clips to a single hdf5 file + for n in range(audios_num): + audio_path = os.path.join(audios_dir, meta_dict['audio_name'][n]) + + if os.path.isfile(audio_path): + logging.info('{} {}'.format(n, audio_path)) + (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True) + audio = pad_or_truncate(audio, clip_samples) + + hf['audio_name'][n] = meta_dict['audio_name'][n].encode() + hf['waveform'][n] = float32_to_int16(audio) + hf['target'][n] = meta_dict['target'][n] + else: + logging.info('{} File does not exist! 
+
+    logging.info('Write to {}'.format(waveforms_hdf5_path))
+    logging.info('Pack hdf5 time: {:.3f}'.format(time.time() - total_time))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers(dest='mode')
+
+    parser_split = subparsers.add_parser('split_unbalanced_csv_to_partial_csvs')
+    parser_split.add_argument('--unbalanced_csv', type=str, required=True, help='Path of unbalanced_csv file to read.')
+    parser_split.add_argument('--unbalanced_partial_csvs_dir', type=str, required=True, help='Directory to save out split unbalanced partial csv.')
+
+    parser_download_wavs = subparsers.add_parser('download_wavs')
+    parser_download_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be downloaded.')
+    parser_download_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory to save out downloaded audio.')
+    parser_download_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only download 10 audios for debugging.')
+
+    parser_pack_wavs = subparsers.add_parser('pack_waveforms_to_hdf5')
+    parser_pack_wavs.add_argument('--csv_path', type=str, required=True, help='Path of csv file containing audio info to be packed.')
+    parser_pack_wavs.add_argument('--audios_dir', type=str, required=True, help='Directory of downloaded audios.')
+    parser_pack_wavs.add_argument('--waveforms_hdf5_path', type=str, required=True, help='Path to save out packed hdf5.')
+    parser_pack_wavs.add_argument('--mini_data', action='store_true', default=False, help='Set true to only pack 10 audios for debugging.')
+
+    args = parser.parse_args()
+
+    if args.mode == 'split_unbalanced_csv_to_partial_csvs':
+        split_unbalanced_csv_to_partial_csvs(args)
+
+    elif args.mode == 'download_wavs':
+        download_wavs(args)
+
+    elif args.mode == 'pack_waveforms_to_hdf5':
+        pack_waveforms_to_hdf5(args)
+
+    else:
+        raise Exception('Incorrect arguments!')
\ No newline at end of file
diff --git a/audio_detection/audio_infer/utils/plot_for_paper.py b/audio_detection/audio_infer/utils/plot_for_paper.py
new file mode 100644
index 0000000..25e799a
--- /dev/null
+++ b/audio_detection/audio_infer/utils/plot_for_paper.py
@@ -0,0 +1,565 @@
+import os
+import sys
+import numpy as np
+import argparse
+import h5py
+import time
+import pickle
+import matplotlib.pyplot as plt
+import csv
+from sklearn import metrics
+
+from utilities import (create_folder, get_filename, d_prime)
+import config
+
+
+def load_statistics(statistics_path):
+    statistics_dict = pickle.load(open(statistics_path, 'rb'))
+
+    bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']])    # (N, classes_num)
+    bal_map = np.mean(bal_map, axis=-1)
+    test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']])    # (N, classes_num)
+    test_map = np.mean(test_map, axis=-1)
+
+    return bal_map, test_map
+
+
+def crop_label(label):
+    max_len = 16
+    if len(label) <= max_len:
+        return label
+    else:
+        words = label.split(' ')
+        cropped_label = ''
+        for w in words:
+            if len(cropped_label + ' ' + w) > max_len:
+                break
+            else:
+                cropped_label += ' {}'.format(w)
+        return cropped_label
+
+
+def add_comma(integer):
+    """E.g., 1234567 -> 1,234,567
+    """
+    return '{:,}'.format(int(integer))
+
+
+def plot_classwise_iteration_map(args):
+
+    # Paths
+    save_out_path
= 'results/classwise_iteration_map.pdf' + create_folder(os.path.dirname(save_out_path)) + + # Load statistics + statistics_dict = pickle.load(open('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl', 'rb')) + + mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']]) + mAP_mat = mAP_mat[0 : 300, :] # 300 * 2000 = 600k iterations + sorted_indexes = np.argsort(config.full_samples_per_class)[::-1] + + fig, axs = plt.subplots(1, 3, figsize=(20, 5)) + ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)] + axs[0].set_ylabel('AP') + + for col in range(0, 3): + axs[col].set_ylim(0, 1.) + axs[col].set_xlim(0, 301) + axs[col].set_xlabel('Iterations') + axs[col].set_ylabel('AP') + axs[col].xaxis.set_ticks(np.arange(0, 301, 100)) + axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k']) + lines = [] + for _ix in ranges[col]: + _label = crop_label(config.labels[sorted_indexes[_ix]]) + \ + ' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]])) + line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label) + lines.append(line) + box = axs[col].get_position() + axs[col].set_position([box.x0, box.y0, box.width * 1., box.height]) + axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.)) + axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + + plt.tight_layout(pad=4, w_pad=1, h_pad=1) + plt.savefig(save_out_path) + print(save_out_path) + + +def plot_six_figures(args): + + # Arguments & parameters + classes_num = config.classes_num + labels = config.labels + max_plot_iteration = 540000 + iterations = np.arange(0, max_plot_iteration, 2000) + + # Paths + class_labels_indices_path = os.path.join('metadata', 'class_labels_indices.csv') + save_out_path = 'results/six_figures.pdf' + create_folder(os.path.dirname(save_out_path)) + + # Plot + fig, ax = plt.subplots(2, 3, figsize=(14, 7)) + bal_alpha = 0.3 + test_alpha = 1.0 + linewidth = 1. 
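+
+    # Plotting convention for every panel below: for each system, the faint curve
+    # (alpha=bal_alpha) is the mAP computed from statistics_dict['bal'] and the solid
+    # curve (alpha=test_alpha) is the mAP from statistics_dict['test'], both as
+    # returned by load_statistics() above.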
+ + # (a) Comparison of architectures + if True: + lines = [] + + # Wavegram-Logmel-CNN + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_WavegramLogmelCnn_balanced_mixup_bs32.pkl') + line, = ax[0, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[0, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # MobileNetV1 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_MobileNetV1_balanced_mixup_bs32.pkl') + line, = ax[0, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 0].legend(handles=lines, loc=2) + ax[0, 0].set_title('(a) Comparison of architectures') + + # (b) Comparison of training data and augmentation' + if True: + lines = [] + + # Full data + balanced sampler + mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Full data + balanced sampler + mixup in time domain + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_timedomain_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Full data + balanced sampler + no mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_nomixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Full data + uniform sampler + no mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_nobalanced_nomixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Balanced data + balanced sampler + mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, 
linewidth=linewidth) + lines.append(line) + + # Balanced data + balanced sampler + no mixup + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_balanced_train_Cnn14_balanced_nomixup_bs32.pkl') + line, = ax[0, 1].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 1].legend(handles=lines, loc=2, fontsize=8) + ax[0, 1].set_title('(b) Comparison of training data and augmentation') + + # (c) Comparison of embedding size + if True: + lines = [] + + # Embedding size 2048 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[0, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Embedding size 128 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb128_balanced_mixup_bs32.pkl') + line, = ax[0, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Embedding size 32 + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_emb32_balanced_mixup_bs32.pkl') + line, = ax[0, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 2].legend(handles=lines, loc=2) + ax[0, 2].set_title('(c) Comparison of embedding size') + + # (d) Comparison of amount of training data + if True: + lines = [] + + # 100% of full training data + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # 80% of full training data + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.8full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # 50% of full training data + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_0.5full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='cnn14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 0].legend(handles=lines, loc=2) + ax[1, 0].set_title('(d) Comparison of amount of training data') + + # (e) Comparison of sampling rate + if True: + lines = [] + + # Cnn14 + 32 kHz + (bal_map, test_map) = 
load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + 16 kHz + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_16k_balanced_mixup_bs32.pkl') + line, = ax[1, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + 8 kHz + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_8k_balanced_mixup_bs32.pkl') + line, = ax[1, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 1].legend(handles=lines, loc=2) + ax[1, 1].set_title('(e) Comparison of sampling rate') + + # (f) Comparison of mel bins number + if True: + lines = [] + + # Cnn14 + 128 mel bins + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel128_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 2].plot(bal_map, color='g', alpha=bal_alpha) + line, = ax[1, 2].plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + 64 mel bins + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel64_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 2].plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # Cnn14 + 32 mel bins + (bal_map, test_map) = load_statistics('paper_statistics/statistics_sr32000_window1024_hop320_mel32_fmin50_fmax14000_full_train_Cnn14_balanced_mixup_bs32.pkl') + line, = ax[1, 2].plot(bal_map, color='b', alpha=bal_alpha) + line, = ax[1, 2].plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 2].legend(handles=lines, loc=2) + ax[1, 2].set_title('(f) Comparison of mel bins number') + + for i in range(2): + for j in range(3): + ax[i, j].set_ylim(0, 0.8) + ax[i, j].set_xlim(0, len(iterations)) + ax[i, j].set_xlabel('Iterations') + ax[i, j].set_ylabel('mAP') + ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50)) + ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k']) + ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05)) + ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', + '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8']) + ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + + plt.tight_layout(0, 1, 0) + plt.savefig(save_out_path) + print('Save figure to {}'.format(save_out_path)) + + +def plot_complexity_map(args): + + # Paths + save_out_path = 'results/complexity_mAP.pdf' + create_folder(os.path.dirname(save_out_path)) + + plt.figure(figsize=(5, 5)) + fig, ax = plt.subplots(1, 1) + + model_types = 
np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54',
+        'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18',
+        'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 'Wavegram-\nLogmel-CNN'])
+    flops = np.array([21.986, 28.166, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810,
+        30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510])
+    mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295,
+        0.266, 0.336, 0.365, 0.355, 0.389, 0.439])
+
+    sorted_indexes = np.sort(flops)
+    ax.scatter(flops, mAPs)
+
+    shift = [[-5.5, -0.004], [1, -0.004], [-1, -0.014], [-2, 0.006], [-7, 0.006],
+        [1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008],
+        [1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]]
+
+    for i, model_type in enumerate(model_types):
+        ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1]))
+
+    ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]])
+    ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]])
+    ax.plot(flops[[6, 7]], mAPs[[6, 7]])
+    ax.plot(flops[[9, 10]], mAPs[[9, 10]])
+    ax.plot(flops[[11, 12]], mAPs[[11, 12]])
+    ax.plot(flops[[13, 14]], mAPs[[13, 14]])
+
+    ax.set_xlim(0, 70)
+    ax.set_ylim(0.2, 0.5)
+    ax.set_xlabel('Multi-adds (million)', fontsize=15)
+    ax.set_ylabel('mAP', fontsize=15)
+    ax.tick_params(axis='x', labelsize=12)
+    ax.tick_params(axis='y', labelsize=12)
+
+    plt.tight_layout(pad=0, h_pad=0, w_pad=0)
+
+    plt.savefig(save_out_path)
+    print('Write out figure to {}'.format(save_out_path))
+
+
+def plot_long_fig(args):
+
+    # Paths
+    stats = pickle.load(open('paper_statistics/stats_for_long_fig.pkl', 'rb'))
+
+    save_out_path = 'results/long_fig.pdf'
+    create_folder(os.path.dirname(save_out_path))
+
+    # Load meta
+    N = len(config.labels)
+    sorted_indexes = stats['sorted_indexes_for_plot']
+    sorted_labels = np.array(config.labels)[sorted_indexes]
+    audio_clips_per_class = stats['official_balanced_training_samples'] + stats['official_unbalanced_training_samples']
+    audio_clips_per_class = audio_clips_per_class[sorted_indexes]
+
+    # Prepare axes for plot
+    (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels)
+
+    # plot the number of training samples
+    ax1a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+    ax2a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+    ax3a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+    ax4a.bar(np.arange(N), audio_clips_per_class, alpha=0.3)
+
+    # Load mAP of different systems
+    """Average instance system of [1] with an mAP of 0.317.
+    [1] Kong, Qiuqiang, Changsong Yu, Yong Xu, Turab Iqbal, Wenwu Wang, and
+    Mark D. Plumbley. "Weakly labelled audioset tagging with attention neural
+    networks." IEEE/ACM Transactions on Audio, Speech, and Language Processing
+    27, no.
11 (2019): 1791-1802.""" + maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision'] + maps_avg_instances = maps_avg_instances[sorted_indexes] + + # PANNs Cnn14 + maps_panns_cnn14 = stats['panns_cnn14']['eval']['average_precision'] + maps_panns_cnn14 = maps_panns_cnn14[sorted_indexes] + + # PANNs MobileNetV1 + maps_panns_mobilenetv1 = stats['panns_mobilenetv1']['eval']['average_precision'] + maps_panns_mobilenetv1 = maps_panns_mobilenetv1[sorted_indexes] + + # PANNs Wavegram-Logmel-Cnn14 + maps_panns_wavegram_logmel_cnn14 = stats['panns_wavegram_logmel_cnn14']['eval']['average_precision'] + maps_panns_wavegram_logmel_cnn14 = maps_panns_wavegram_logmel_cnn14[sorted_indexes] + + # Plot mAPs + _scatter_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='g') + _scatter_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, s=5, c='r') + _scatter_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b') + _scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k') + + linewidth = 0.7 + line0te = _plot_4_rows(maps_panns_wavegram_logmel_cnn14, ax1b, ax2b, ax3b, ax4b, + c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN') + line1te = _plot_4_rows(maps_panns_cnn14, ax1b, ax2b, ax3b, ax4b, c='r', + linewidth=linewidth, label='AP with CNN14') + line2te = _plot_4_rows(maps_panns_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b', + linewidth=linewidth, label='AP with MobileNetV1') + line3te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k', + linewidth=linewidth, label='AP with averaging instances (baseline)') + + # Plot label quality + label_quality = stats['label_quality'] + sorted_label_quality = np.array(label_quality)[sorted_indexes] + for k in range(len(sorted_label_quality)): + if sorted_label_quality[k] and sorted_label_quality[k] == 1: + sorted_label_quality[k] = 0.99 + + ax1b.scatter(np.arange(N)[sorted_label_quality != None], + sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+') + ax2b.scatter(np.arange(N)[sorted_label_quality != None], + sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+') + ax3b.scatter(np.arange(N)[sorted_label_quality != None], + sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+') + line_label_quality = ax4b.scatter(np.arange(N)[sorted_label_quality != None], + sorted_label_quality[sorted_label_quality != None], s=12, c='r', linewidth=0.8, marker='+', label='Label quality') + ax1b.scatter(np.arange(N)[sorted_label_quality == None], + 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_') + ax2b.scatter(np.arange(N)[sorted_label_quality == None], + 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_') + ax3b.scatter(np.arange(N)[sorted_label_quality == None], + 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_') + ax4b.scatter(np.arange(N)[sorted_label_quality == None], + 0.5 * np.ones(len(np.arange(N)[sorted_label_quality == None])), s=12, c='r', linewidth=0.8, marker='_') + + plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1) + plt.tight_layout(0, 0, 0) + plt.savefig(save_out_path) + print('Save fig to {}'.format(save_out_path)) + + +def prepare_plot_long_4_rows(sorted_lbs): + N = len(sorted_lbs) + + f,(ax1a, ax2a, ax3a, ax4a) = 
plt.subplots(4, 1, sharey=False, facecolor='w', figsize=(10, 10.5)) + + fontsize = 5 + + K = 132 + ax1a.set_xlim(0, K) + ax2a.set_xlim(K, 2 * K) + ax3a.set_xlim(2 * K, 3 * K) + ax4a.set_xlim(3 * K, N) + + truncated_sorted_lbs = [] + for lb in sorted_lbs: + lb = lb[0 : 25] + words = lb.split(' ') + if len(words[-1]) < 3: + lb = ' '.join(words[0:-1]) + truncated_sorted_lbs.append(lb) + + ax1a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax2a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax3a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax4a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + + ax1a.set_yscale('log') + ax2a.set_yscale('log') + ax3a.set_yscale('log') + ax4a.set_yscale('log') + + ax1b = ax1a.twinx() + ax2b = ax2a.twinx() + ax3b = ax3a.twinx() + ax4b = ax4a.twinx() + ax1b.set_ylim(0., 1.) + ax2b.set_ylim(0., 1.) + ax3b.set_ylim(0., 1.) + ax4b.set_ylim(0., 1.) + ax1b.set_ylabel('Average precision') + ax2b.set_ylabel('Average precision') + ax3b.set_ylabel('Average precision') + ax4b.set_ylabel('Average precision') + + ax1b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax2b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax3b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax4b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + + ax1a.xaxis.set_ticks(np.arange(K)) + ax1a.xaxis.set_ticklabels(truncated_sorted_lbs[0:K], rotation=90, fontsize=fontsize) + ax1a.xaxis.tick_bottom() + ax1a.set_ylabel("Number of audio clips") + + ax2a.xaxis.set_ticks(np.arange(K, 2*K)) + ax2a.xaxis.set_ticklabels(truncated_sorted_lbs[K:2*K], rotation=90, fontsize=fontsize) + ax2a.xaxis.tick_bottom() + ax2a.set_ylabel("Number of audio clips") + + ax3a.xaxis.set_ticks(np.arange(2*K, 3*K)) + ax3a.xaxis.set_ticklabels(truncated_sorted_lbs[2*K:3*K], rotation=90, fontsize=fontsize) + ax3a.xaxis.tick_bottom() + ax3a.set_ylabel("Number of audio clips") + + ax4a.xaxis.set_ticks(np.arange(3*K, N)) + ax4a.xaxis.set_ticklabels(truncated_sorted_lbs[3*K:], rotation=90, fontsize=fontsize) + ax4a.xaxis.tick_bottom() + ax4a.set_ylabel("Number of audio clips") + + ax1a.spines['right'].set_visible(False) + ax1b.spines['right'].set_visible(False) + ax2a.spines['left'].set_visible(False) + ax2b.spines['left'].set_visible(False) + ax2a.spines['right'].set_visible(False) + ax2b.spines['right'].set_visible(False) + ax3a.spines['left'].set_visible(False) + ax3b.spines['left'].set_visible(False) + ax3a.spines['right'].set_visible(False) + ax3b.spines['right'].set_visible(False) + ax4a.spines['left'].set_visible(False) + ax4b.spines['left'].set_visible(False) + + plt.subplots_adjust(hspace = 0.8) + + return ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b + + +def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.): + N = len(x) + ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + + +def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""): + N = len(x) + ax.plot(x, c=c, linewidth=linewidth, alpha=alpha) + ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha) + ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha) + line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label) + return line + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='') + subparsers = 
parser.add_subparsers(dest='mode') + + parser_classwise_iteration_map = subparsers.add_parser('plot_classwise_iteration_map') + parser_six_figures = subparsers.add_parser('plot_six_figures') + parser_complexity_map = subparsers.add_parser('plot_complexity_map') + parser_long_fig = subparsers.add_parser('plot_long_fig') + + args = parser.parse_args() + + if args.mode == 'plot_classwise_iteration_map': + plot_classwise_iteration_map(args) + + elif args.mode == 'plot_six_figures': + plot_six_figures(args) + + elif args.mode == 'plot_complexity_map': + plot_complexity_map(args) + + elif args.mode == 'plot_long_fig': + plot_long_fig(args) + + else: + raise Exception('Incorrect argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/plot_statistics.py b/audio_detection/audio_infer/utils/plot_statistics.py new file mode 100644 index 0000000..3ea9f14 --- /dev/null +++ b/audio_detection/audio_infer/utils/plot_statistics.py @@ -0,0 +1,2034 @@ +import os +import sys +import numpy as np +import argparse +import h5py +import time +import _pickle as cPickle +import _pickle +import matplotlib.pyplot as plt +import csv +from sklearn import metrics + +from utilities import (create_folder, get_filename, d_prime) +import config + + +def _load_metrics0(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(workspace0, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + +def _load_metrics0_classwise(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(workspace0, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + return statistics_dict['test'][300]['average_precision'] + + +def _load_metrics0_classwise2(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, 
batch_size): + workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(workspace0, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + k = 270 + mAP = np.mean(statistics_dict['test'][k]['average_precision']) + mAUC = np.mean(statistics_dict['test'][k]['auc']) + dprime = d_prime(mAUC) + return mAP, mAUC, dprime + + +def _load_metrics_classwise(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + workspace = '/mnt/cephfs_new_wj/speechsv/kongqiuqiang/workspaces/cvssp/pub_audioset_tagging_cnn' + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + k = 300 + mAP = np.mean(statistics_dict['test'][k]['average_precision']) + mAUC = np.mean(statistics_dict['test'][k]['auc']) + dprime = d_prime(mAUC) + return mAP, mAUC, dprime + + +def plot(args): + + # Arguments & parameters + dataset_dir = args.dataset_dir + workspace = args.workspace + select = args.select + + classes_num = config.classes_num + max_plot_iteration = 1000000 + iterations = np.arange(0, max_plot_iteration, 2000) + + class_labels_indices_path = os.path.join(dataset_dir, 'metadata', + 'class_labels_indices.csv') + + save_out_path = 'results/{}.pdf'.format(select) + create_folder(os.path.dirname(save_out_path)) + + # Read labels + labels = config.labels + + # Plot + fig, ax = plt.subplots(1, 1, figsize=(15, 8)) + lines = [] + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + bal_alpha = 0.3 + test_alpha = 1.0 + lines = [] + + if select 
== '1_cnn13': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_no_dropout', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_no_specaug', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_no_specaug', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_no_dropout', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_no_mixup', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_mixup_in_wave', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='c', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_mixup_in_wave', color='c', alpha=test_alpha) + lines.append(line) + + elif select == '1_pooling': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_gwrp', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_gmpgapgwrp', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_att', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_gmpgapatt', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_resnet': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet18', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='ResNet18', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='resnet34', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 
320, 64, 50, 14000, 'full_train', 'ResNet50', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='c', alpha=bal_alpha) + line, = ax.plot(test_map, label='resnet50', color='c', alpha=test_alpha) + lines.append(line) + + elif select == '1_densenet': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'DenseNet121', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='densenet121', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'DenseNet201', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='densenet201', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_cnn9': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn5', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn9', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_hop': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 500, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_hop500', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 640, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_hop640', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 1000, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_hop1000', color='k', alpha=test_alpha) + lines.append(line) + + elif select == '1_emb': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, 
color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_emb32', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_emb128', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13_emb512', color='k', alpha=test_alpha) + lines.append(line) + + elif select == '1_mobilenet': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='mobilenetv1', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV2', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='mobilenetv2', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_waveform': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_LeeNet', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet18', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_LeeNet18', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_DaiNet', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_DaiNet', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='c', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_ResNet34', 
color='c', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet50', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_ResNet50', color='m', alpha=test_alpha) + lines.append(line) + + elif select == '1_waveform_cnn2d': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_SpAndWav', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_WavCnn2d', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_decision_level': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelMax', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_DecisionLevelMax', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAvg', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_DecisionLevelAvg', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAtt', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_DecisionLevelAtt', color='k', alpha=test_alpha) + lines.append(line) + + elif select == '1_transformer': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_Transformer1', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer3', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + 
line, = ax.plot(test_map, label='Cnn13_Transformer3', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_Transformer6', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_Transformer6', color='k', alpha=test_alpha) + lines.append(line) + + elif select == '1_aug': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,mixup', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,none,none', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,none', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup_from_0_epoch', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,mixup_from_0_epoch', color='m', alpha=test_alpha) + lines.append(line) + + elif select == '1_bal_train_aug': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,mixup', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,none,none', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,none', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup_from_0_epoch', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,balanced,mixup_from_0_epoch', color='m', alpha=test_alpha) + lines.append(line) + + elif select == '1_sr': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, 
color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_16k', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_8k', color='b', alpha=test_alpha) + lines.append(line) + + elif select == '1_time_domain': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_time_domain', color='b', alpha=test_alpha) + lines.append(line) + + elif select == '1_partial_full': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.9_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,partial_0.9', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,partial_0.8', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.7_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,partial_0.7', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,partial_0.5', color='m', alpha=test_alpha) + lines.append(line) + + elif select == '1_window': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 2048, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_win2048', color='b', alpha=test_alpha) + lines.append(line) + + elif select == '1_melbins': + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = 
ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_mel32', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_mel128', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '1_alternate': + max_plot_iteration = 2000000 + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'alternate', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14_alternate', color='b', alpha=test_alpha) + lines.append(line) + + elif select == '2_all': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='MobileNetV1', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='ResNet34', color='grey', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 
'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_WavCnn2d', color='m', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_SpAndWav', color='orange', alpha=test_alpha) + lines.append(line) + + elif select == '2_emb': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_emb32', color='r', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_128', color='k', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='Cnn13_512', color='g', alpha=test_alpha) + lines.append(line) + + elif select == '2_aug': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn13', color='b', alpha=test_alpha) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_no_specaug', 'clip_bce', 'none', 'none', 32) + line, = ax.plot(bal_map, color='c', alpha=bal_alpha) + line, = ax.plot(test_map, label='cnn14,none,none', color='c', alpha=test_alpha) + lines.append(line) + + + + ax.set_ylim(0, 1.) 
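+    # The curves plotted above are indexed by evaluation checkpoint rather than by training
+    # iteration: statistics are dumped every 2000 iterations, so array index 25 corresponds to
+    # 25 * 2000 = 50000 iterations. The tick settings below re-label the checkpoint indices
+    # as iteration counts on the x-axis.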
+ ax.set_xlim(0, len(iterations)) + ax.xaxis.set_ticks(np.arange(0, len(iterations), 25)) + ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000)) + ax.yaxis.set_ticks(np.arange(0, 1.01, 0.05)) + ax.yaxis.set_ticklabels(np.around(np.arange(0, 1.01, 0.05), decimals=2)) + ax.grid(color='b', linestyle='solid', linewidth=0.3) + plt.legend(handles=lines, loc=2) + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0)) + + plt.savefig(save_out_path) + print('Save figure to {}'.format(save_out_path)) + + +def plot_for_paper(args): + + # Arguments & parameters + dataset_dir = args.dataset_dir + workspace = args.workspace + select = args.select + + classes_num = config.classes_num + max_plot_iteration = 1000000 + iterations = np.arange(0, max_plot_iteration, 2000) + + class_labels_indices_path = os.path.join(dataset_dir, 'metadata', + 'class_labels_indices.csv') + + save_out_path = 'results/paper_{}.pdf'.format(select) + create_folder(os.path.dirname(save_out_path)) + + # Read labels + labels = config.labels + + # Plot + fig, ax = plt.subplots(1, 1, figsize=(6, 4)) + lines = [] + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + bal_alpha = 0.3 + test_alpha = 1.0 + lines = [] + linewidth = 1. 
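+    # statistics.pkl holds {'bal': [...], 'test': [...]}, one entry per saved checkpoint, and
+    # each entry carries an 'average_precision' array of shape (classes_num,). _load_metrics
+    # reduces these to one mAP value per checkpoint, so bal_map / test_map are 1-D curves of
+    # length num_checkpoints, e.g. (illustrative call; actual paths depend on the workspace layout):
+    #   bal_map, test_map, legend = _load_metrics('main', 32000, 1024, 320, 64, 50, 14000,
+    #       'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32)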
+ + max_plot_iteration = 540000 + + if select == '2_all': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + # line, = ax.plot(test_map, label='Wavegram-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + elif select == '2_emb': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, 
label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='g', alpha=bal_alpha) + # line, = ax.plot(test_map, label='Cnn13_512', color='g', alpha=test_alpha) + # lines.append(line) + + elif select == '2_bal': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax.plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + elif select == '2_sr': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 
'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + elif select == '2_partial': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + # 320, 64, 50, 14000, 'partial_0.9_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + # line, = ax.plot(test_map, label='cnn14,partial_0.9', color='b', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + # 320, 64, 50, 14000, 'partial_0.7_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + # line, = ax.plot(test_map, label='cnn14,partial_0.7', color='k', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='cnn14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + elif select == '2_melbins': + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax.plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax.plot(bal_map, color='r', alpha=bal_alpha) + line, = ax.plot(test_map, label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax.set_ylim(0, 0.8) + ax.set_xlim(0, len(iterations)) + ax.set_xlabel('Iterations') + ax.set_ylabel('mAP') + ax.xaxis.set_ticks(np.arange(0, len(iterations), 50)) + # ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000)) + 
ax.xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k']) + ax.yaxis.set_ticks(np.arange(0, 0.81, 0.05)) + ax.yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8']) + # ax.yaxis.set_ticklabels(np.around(np.arange(0, 0.81, 0.05), decimals=2)) + ax.yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + ax.xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + plt.legend(handles=lines, loc=2) + plt.tight_layout(0, 0, 0) + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0)) + + plt.savefig(save_out_path) + print('Save figure to {}'.format(save_out_path)) + + +def plot_for_paper2(args): + + # Arguments & parameters + dataset_dir = args.dataset_dir + workspace = args.workspace + + classes_num = config.classes_num + max_plot_iteration = 1000000 + iterations = np.arange(0, max_plot_iteration, 2000) + + class_labels_indices_path = os.path.join(dataset_dir, 'metadata', + 'class_labels_indices.csv') + + save_out_path = 'results/paper2.pdf' + create_folder(os.path.dirname(save_out_path)) + + # Read labels + labels = config.labels + + # Plot + fig, ax = plt.subplots(2, 3, figsize=(14, 7)) + lines = [] + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + def _load_metrics0(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size): + workspace0 = '/mnt/cephfs_new_wj/speechsv/qiuqiang.kong/workspaces/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(workspace0, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + bal_map = np.array([statistics['average_precision'] for statistics in statistics_dict['bal']]) # (N, classes_num) + bal_map = np.mean(bal_map, axis=-1) + test_map = np.array([statistics['average_precision'] for statistics in statistics_dict['test']]) # (N, classes_num) + test_map = np.mean(test_map, 
axis=-1) + legend = '{}, {}, bal={}, aug={}, bs={}'.format(data_type, model_type, balanced, augmentation, batch_size) + + # return {'bal_map': bal_map, 'test_map': test_map, 'legend': legend} + return bal_map, test_map, legend + + bal_alpha = 0.3 + test_alpha = 1.0 + lines = [] + linewidth = 1. + + max_plot_iteration = 540000 + + if True: + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='CNN14', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='cnn9', color='r', alpha=test_alpha) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='cnn5', color='g', alpha=test_alpha) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='MobileNetV1', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='b', alpha=bal_alpha) + # line, = ax.plot(test_map, label='Cnn1d_ResNet34', color='grey', alpha=test_alpha) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax[0, 0].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + # line, = ax[0, 0].plot(test_map, label='ResNet38', color='k', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + # (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + # 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32) + # line, = ax.plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + # line, = ax.plot(test_map, label='Wavegram-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + # lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 0].plot(test_map, label='Wavegram-Logmel-CNN', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 0].legend(handles=lines, loc=2) + ax[0, 0].set_title('(a) Comparison of architectures') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 1].plot(bal_map, color='r', 
alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (1.9m)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + line, = ax[0, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,no-bal,no-mixup (1.9m)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 1].plot(bal_map, color='y', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup-wav (1.9m)', color='y', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax[0, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (1.9m)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + line, = ax[0, 1].plot(bal_map, color='k', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,no-mixup (20k)', color='k', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 1].plot(bal_map, color='m', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 1].plot(test_map, label='CNN14,bal,mixup (20k)', color='m', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 1].legend(handles=lines, loc=2, fontsize=8) + + ax[0, 1].set_title('(b) Comparison of training data and augmentation') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=2048', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 2].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=32', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics0('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[0, 2].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[0, 2].plot(test_map, label='CNN14,emb=128', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[0, 2].legend(handles=lines, loc=2) + ax[0, 2].set_title('(c) Comparison of embedding size') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + 
(bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 0].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='CNN14 (100% full)', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 0].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='CNN14 (80% full)', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 0].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 0].plot(test_map, label='cnn14 (50% full)', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 0].legend(handles=lines, loc=2) + ax[1, 0].set_title('(d) Comparison of amount of training data') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 1].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,32kHz', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 1].plot(bal_map, color='b', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,16kHz', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 1].plot(bal_map, color='g', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 1].plot(test_map, label='CNN14,8kHz', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 1].legend(handles=lines, loc=2) + ax[1, 1].set_title('(e) Comparison of sampling rate') + + if True: + lines = [] + iterations = np.arange(0, max_plot_iteration, 2000) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 2].plot(bal_map, color='r', alpha=bal_alpha, linewidth=linewidth) + line, = ax[1, 2].plot(test_map, label='CNN14,64-melbins', color='r', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 2].plot(bal_map, color='b', alpha=bal_alpha) + line, = ax[1, 2].plot(test_map, label='CNN14,32-melbins', color='b', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + (bal_map, test_map, legend) = _load_metrics('main', 32000, 1024, + 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32) + line, = ax[1, 2].plot(bal_map, color='g', alpha=bal_alpha) + line, = ax[1, 2].plot(test_map, 
label='CNN14,128-melbins', color='g', alpha=test_alpha, linewidth=linewidth) + lines.append(line) + + ax[1, 2].legend(handles=lines, loc=2) + ax[1, 2].set_title('(f) Comparison of mel bins number') + + for i in range(2): + for j in range(3): + ax[i, j].set_ylim(0, 0.8) + ax[i, j].set_xlim(0, len(iterations)) + ax[i, j].set_xlabel('Iterations') + ax[i, j].set_ylabel('mAP') + ax[i, j].xaxis.set_ticks(np.arange(0, len(iterations), 50)) + # ax.xaxis.set_ticklabels(np.arange(0, max_plot_iteration, 50000)) + ax[i, j].xaxis.set_ticklabels(['0', '100k', '200k', '300k', '400k', '500k']) + ax[i, j].yaxis.set_ticks(np.arange(0, 0.81, 0.05)) + ax[i, j].yaxis.set_ticklabels(['0', '', '0.1', '', '0.2', '', '0.3', '', '0.4', '', '0.5', '', '0.6', '', '0.7', '', '0.8']) + # ax.yaxis.set_ticklabels(np.around(np.arange(0, 0.81, 0.05), decimals=2)) + ax[i, j].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + ax[i, j].xaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + + plt.tight_layout(0, 1, 0) + # box = ax.get_position() + # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + # ax.legend(handles=lines, bbox_to_anchor=(1.0, 1.0)) + + plt.savefig(save_out_path) + print('Save figure to {}'.format(save_out_path)) + + +def table_values(args): + + # Arguments & parameters + dataset_dir = args.dataset_dir + workspace = args.workspace + select = args.select + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + idx = iteration // 2000 + mAP = np.mean(statistics_dict['test'][idx]['average_precision']) + mAUC = np.mean(statistics_dict['test'][idx]['auc']) + dprime = d_prime(mAUC) + + print('mAP: {:.3f}'.format(mAP)) + print('mAUC: {:.3f}'.format(mAUC)) + print('dprime: {:.3f}'.format(dprime)) + + + if select == 'cnn13': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn5': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn5', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn9': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn9', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_decisionlevelmax': + iteration = 400000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelMax', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_decisionlevelavg': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAvg', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_decisionlevelatt': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_DecisionLevelAtt', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 
'cnn13_emb32': + iteration = 560000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_emb128': + iteration = 560000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_emb512': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_emb512', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_hop500': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 500, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_hop640': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 640, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'cnn13_hop1000': + iteration = 540000 + _load_metrics('main', 32000, 1024, + 1000, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'mobilenetv1': + iteration = 560000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'mobilenetv2': + iteration = 560000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV2', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet18': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet18', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet34': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet34', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet50': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'ResNet50', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'dainet': + iteration = 600000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_DaiNet', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'leenet': + iteration = 540000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'leenet18': + iteration = 440000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_LeeNet18', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet34_1d': + iteration = 500000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet34', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'resnet50_1d': + iteration = 500000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn1d_ResNet50', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'waveform_cnn2d': + iteration = 660000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_WavCnn2d', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + elif select == 'waveform_spandwav': + iteration = 700000 + _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + +def crop_label(label): + max_len = 16 + if len(label) <= max_len: + return label + else: + words = label.split(' ') + cropped_label = '' + for w 
in words: + if len(cropped_label + ' ' + w) > max_len: + break + else: + cropped_label += ' {}'.format(w) + return cropped_label + +def add_comma(integer): + integer = int(integer) + if integer >= 1000: + return str(integer // 1000) + ',' + str(integer % 1000) + else: + return str(integer) + + +def plot_class_iteration(args): + + # Arguments & parameters + workspace = args.workspace + select = args.select + + save_out_path = 'results_map/class_iteration_map.pdf' + create_folder(os.path.dirname(save_out_path)) + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration): + statistics_path = os.path.join(workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + return statistics_dict + + iteration = 600000 + statistics_dict = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + mAP_mat = np.array([e['average_precision'] for e in statistics_dict['test']]) + mAP_mat = mAP_mat[0 : 300, :] + sorted_indexes = np.argsort(config.full_samples_per_class)[::-1] + + + fig, axs = plt.subplots(1, 3, figsize=(20, 5)) + ranges = [np.arange(0, 10), np.arange(250, 260), np.arange(517, 527)] + axs[0].set_ylabel('AP') + + for col in range(0, 3): + axs[col].set_ylim(0, 1.) + axs[col].set_xlim(0, 301) + axs[col].set_xlabel('Iterations') + axs[col].set_ylabel('AP') + axs[col].xaxis.set_ticks(np.arange(0, 301, 100)) + axs[col].xaxis.set_ticklabels(['0', '200k', '400k', '600k']) + lines = [] + for _ix in ranges[col]: + _label = crop_label(config.labels[sorted_indexes[_ix]]) + \ + ' ({})'.format(add_comma(config.full_samples_per_class[sorted_indexes[_ix]])) + line, = axs[col].plot(mAP_mat[:, sorted_indexes[_ix]], label=_label) + lines.append(line) + box = axs[col].get_position() + axs[col].set_position([box.x0, box.y0, box.width * 1., box.height]) + axs[col].legend(handles=lines, bbox_to_anchor=(1., 1.)) + axs[col].yaxis.grid(color='k', linestyle='solid', alpha=0.3, linewidth=0.3) + + plt.tight_layout(pad=4, w_pad=1, h_pad=1) + plt.savefig(save_out_path) + print(save_out_path) + + +def _load_old_metrics(workspace, filename, iteration, data_type): + + assert data_type in ['train', 'test'] + + stat_name = "stat_{}_iters.p".format(iteration) + + # Load stats + stat_path = os.path.join(workspace, "stats", filename, data_type, stat_name) + try: + stats = cPickle.load(open(stat_path, 'rb')) + except: + stats = cPickle.load(open(stat_path, 'rb'), encoding='latin1') + + precisions = [stat['precisions'] for stat in stats] + recalls = [stat['recalls'] for stat in stats] + maps = np.array([stat['AP'] for stat in stats]) + aucs = np.array([stat['auc'] for stat in stats]) + + return {'average_precision': maps, 'AUC': aucs} + +def _sort(ys): + sorted_idxes = np.argsort(ys) + sorted_idxes = sorted_idxes[::-1] + sorted_ys = ys[sorted_idxes] + sorted_lbs = [config.labels[e] for e in sorted_idxes] + return sorted_ys, sorted_idxes, sorted_lbs + +def load_data(hdf5_path): + with h5py.File(hdf5_path, 'r') as hf: + x = hf['x'][:] + y = hf['y'][:] + 
video_id_list = list(hf['video_id_list'][:]) + return x, y, video_id_list + +def get_avg_stats(workspace, bgn_iter, fin_iter, interval_iter, filename, data_type): + + assert data_type in ['train', 'test'] + bal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/bal_train.h5" + eval_hdf5 = "/vol/vssp/msos/audioset/packed_features/eval.h5" + unbal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/unbal_train.h5" + + t1 = time.time() + if data_type == 'test': + (te_x, te_y, te_id_list) = load_data(eval_hdf5) + elif data_type == 'train': + (te_x, te_y, te_id_list) = load_data(bal_train_hdf5) + y = te_y + + prob_dir = os.path.join(workspace, "probs", filename, data_type) + names = os.listdir(prob_dir) + + probs = [] + iters = range(bgn_iter, fin_iter, interval_iter) + for iter in iters: + pickle_path = os.path.join(prob_dir, "prob_%d_iters.p" % iter) + try: + prob = cPickle.load(open(pickle_path, 'rb')) + except: + prob = cPickle.load(open(pickle_path, 'rb'), encoding='latin1') + probs.append(prob) + + avg_prob = np.mean(np.array(probs), axis=0) + + n_out = y.shape[1] + stats = [] + for k in range(n_out): # around 7 seconds + (precisions, recalls, thresholds) = metrics.precision_recall_curve(y[:, k], avg_prob[:, k]) + avg_precision = metrics.average_precision_score(y[:, k], avg_prob[:, k], average=None) + (fpr, tpr, thresholds) = metrics.roc_curve(y[:, k], avg_prob[:, k]) + auc = metrics.roc_auc_score(y[:, k], avg_prob[:, k], average=None) + # eer = pp_data.eer(avg_prob[:, k], y[:, k]) + + skip = 1000 + dict = {'precisions': precisions[0::skip], 'recalls': recalls[0::skip], 'AP': avg_precision, + 'fpr': fpr[0::skip], 'fnr': 1. - tpr[0::skip], 'auc': auc} + + stats.append(dict) + + mAPs = np.array([e['AP'] for e in stats]) + aucs = np.array([e['auc'] for e in stats]) + + print("Get avg time: {}".format(time.time() - t1)) + + return {'average_precision': mAPs, 'auc': aucs} + + +def _samples_num_per_class(): + bal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/bal_train.h5" + eval_hdf5 = "/vol/vssp/msos/audioset/packed_features/eval.h5" + unbal_train_hdf5 = "/vol/vssp/msos/audioset/packed_features/unbal_train.h5" + + (x, y, id_list) = load_data(eval_hdf5) + eval_num = np.sum(y, axis=0) + + (x, y, id_list) = load_data(bal_train_hdf5) + bal_num = np.sum(y, axis=0) + + (x, y, id_list) = load_data(unbal_train_hdf5) + unbal_num = np.sum(y, axis=0) + + return bal_num, unbal_num, eval_num + + +def get_label_quality(): + + rate_csv = '/vol/vssp/msos/qk/workspaces/pub_audioset_tagging_cnn_transfer/metadata/qa_true_counts.csv' + + with open(rate_csv, 'r') as f: + reader = csv.reader(f, delimiter=',') + lis = list(reader) + + rates = [] + + for n in range(1, len(lis)): + li = lis[n] + if float(li[1]) == 0: + rate = None + else: + rate = float(li[2]) / float(li[1]) + rates.append(rate) + + return rates + + +def summary_stats(args): + # Arguments & parameters + workspace = args.workspace + + out_stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl') + create_folder(os.path.dirname(out_stat_path)) + + # Old workspace + old_workspace = '/vol/vssp/msos/qk/workspaces/audioset_classification' + + # bal_train_metrics = _load_old_metrics(old_workspace, 'tmp127', 20000, 'train') + # eval_metrics = _load_old_metrics(old_workspace, 'tmp127', 20000, 'test') + + bal_train_metrics = get_avg_stats(old_workspace, bgn_iter=10000, fin_iter=50001, interval_iter=5000, filename='tmp127_re', data_type='train') + eval_metrics = get_avg_stats(old_workspace, bgn_iter=10000, fin_iter=50001, 
interval_iter=5000, filename='tmp127_re', data_type='test') + + maps0te = eval_metrics['average_precision'] + (maps0te, sorted_idxes, sorted_lbs) = _sort(maps0te) + + bal_num, unbal_num, eval_num = _samples_num_per_class() + + output_dict = { + 'labels': config.labels, + 'label_quality': get_label_quality(), + 'sorted_indexes_for_plot': sorted_idxes, + 'official_balanced_trainig_samples': bal_num, + 'official_unbalanced_training_samples': unbal_num, + 'official_eval_samples': eval_num, + 'downloaded_full_training_samples': config.full_samples_per_class, + 'averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations': + {'bal_train': bal_train_metrics, 'eval': eval_metrics} + } + + def _load_metrics(filename, sample_rate, window_size, hop_size, mel_bins, fmin, + fmax, data_type, model_type, loss_type, balanced, augmentation, batch_size, iteration): + _workspace = '/vol/vssp/msos/qk/bytedance/workspaces_important/pub_audioset_tagging_cnn_transfer' + statistics_path = os.path.join(_workspace, 'statistics', filename, + 'sample_rate={},window_size={},hop_size={},mel_bins={},fmin={},fmax={}'.format( + sample_rate, window_size, hop_size, mel_bins, fmin, fmax), + 'data_type={}'.format(data_type), model_type, + 'loss_type={}'.format(loss_type), 'balanced={}'.format(balanced), + 'augmentation={}'.format(augmentation), 'batch_size={}'.format(batch_size), + 'statistics.pkl') + + statistics_dict = cPickle.load(open(statistics_path, 'rb')) + + _idx = iteration // 2000 + _dict = {'bal_train': {'average_precision': statistics_dict['bal'][_idx]['average_precision'], + 'auc': statistics_dict['bal'][_idx]['auc']}, + 'eval': {'average_precision': statistics_dict['test'][_idx]['average_precision'], + 'auc': statistics_dict['test'][_idx]['auc']}} + return _dict + + iteration = 600000 + output_dict['cnn13_system_iteration60k'] = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + iteration = 560000 + output_dict['mobilenetv1_system_iteration56k'] = _load_metrics('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'MobileNetV1', 'clip_bce', 'balanced', 'mixup', 32, iteration) + + cPickle.dump(output_dict, open(out_stat_path, 'wb')) + print('Write stats for paper to {}'.format(out_stat_path)) + + +def prepare_plot_long_4_rows(sorted_lbs): + N = len(sorted_lbs) + + f,(ax1a, ax2a, ax3a, ax4a) = plt.subplots(4, 1,sharey=False, facecolor='w', figsize=(10, 12)) + + fontsize = 5 + + K = 132 + ax1a.set_xlim(0, K) + ax2a.set_xlim(K, 2 * K) + ax3a.set_xlim(2 * K, 3 * K) + ax4a.set_xlim(3 * K, N) + + truncated_sorted_lbs = [] + for lb in sorted_lbs: + lb = lb[0 : 25] + words = lb.split(' ') + if len(words[-1]) < 3: + lb = ' '.join(words[0:-1]) + truncated_sorted_lbs.append(lb) + + ax1a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax2a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax3a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + ax4a.grid(which='major', axis='x', linestyle='-', alpha=0.3) + + ax1a.set_yscale('log') + ax2a.set_yscale('log') + ax3a.set_yscale('log') + ax4a.set_yscale('log') + + ax1b = ax1a.twinx() + ax2b = ax2a.twinx() + ax3b = ax3a.twinx() + ax4b = ax4a.twinx() + ax1b.set_ylim(0., 1.) + ax2b.set_ylim(0., 1.) + ax3b.set_ylim(0., 1.) + ax4b.set_ylim(0., 1.) 
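+    # The figure uses four stacked rows, each showing a 132-class slice of the sorted label
+    # list (ax1a-ax4a cover 0-K, K-2K, 2K-3K and 3K-N). The left axes use a log scale for the
+    # per-class clip counts drawn as bars, while the twinned right axes (ax*b, range 0-1) hold
+    # the per-class average precision curves added later in plot_long_fig.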
+ ax1b.set_ylabel('Average precision') + ax2b.set_ylabel('Average precision') + ax3b.set_ylabel('Average precision') + ax4b.set_ylabel('Average precision') + + ax1b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax2b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax3b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + ax4b.yaxis.grid(color='grey', linestyle='--', alpha=0.5) + + ax1a.xaxis.set_ticks(np.arange(K)) + ax1a.xaxis.set_ticklabels(truncated_sorted_lbs[0:K], rotation=90, fontsize=fontsize) + ax1a.xaxis.tick_bottom() + ax1a.set_ylabel("Number of audio clips") + + ax2a.xaxis.set_ticks(np.arange(K, 2*K)) + ax2a.xaxis.set_ticklabels(truncated_sorted_lbs[K:2*K], rotation=90, fontsize=fontsize) + ax2a.xaxis.tick_bottom() + # ax2a.tick_params(left='off', which='both') + ax2a.set_ylabel("Number of audio clips") + + ax3a.xaxis.set_ticks(np.arange(2*K, 3*K)) + ax3a.xaxis.set_ticklabels(truncated_sorted_lbs[2*K:3*K], rotation=90, fontsize=fontsize) + ax3a.xaxis.tick_bottom() + ax3a.set_ylabel("Number of audio clips") + + ax4a.xaxis.set_ticks(np.arange(3*K, N)) + ax4a.xaxis.set_ticklabels(truncated_sorted_lbs[3*K:], rotation=90, fontsize=fontsize) + ax4a.xaxis.tick_bottom() + # ax4a.tick_params(left='off', which='both') + ax4a.set_ylabel("Number of audio clips") + + ax1a.spines['right'].set_visible(False) + ax1b.spines['right'].set_visible(False) + ax2a.spines['left'].set_visible(False) + ax2b.spines['left'].set_visible(False) + ax2a.spines['right'].set_visible(False) + ax2b.spines['right'].set_visible(False) + ax3a.spines['left'].set_visible(False) + ax3b.spines['left'].set_visible(False) + ax3a.spines['right'].set_visible(False) + ax3b.spines['right'].set_visible(False) + ax4a.spines['left'].set_visible(False) + ax4b.spines['left'].set_visible(False) + + plt.subplots_adjust(hspace = 0.8) + + return ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b + +def _scatter_4_rows(x, ax, ax2, ax3, ax4, s, c, marker='.', alpha=1.): + N = len(x) + ax.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax2.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax3.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + ax4.scatter(np.arange(N), x, s=s, c=c, marker=marker, alpha=alpha) + +def _plot_4_rows(x, ax, ax2, ax3, ax4, c, linewidth=1.0, alpha=1.0, label=""): + N = len(x) + ax.plot(x, c=c, linewidth=linewidth, alpha=alpha) + ax2.plot(x, c=c, linewidth=linewidth, alpha=alpha) + ax3.plot(x, c=c, linewidth=linewidth, alpha=alpha) + line, = ax4.plot(x, c=c, linewidth=linewidth, alpha=alpha, label=label) + return line + +def plot_long_fig(args): + # Arguments & parameters + workspace = args.workspace + + # Paths + stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl') + save_out_path = 'results/long_fig.pdf' + create_folder(os.path.dirname(save_out_path)) + + # Stats + stats = cPickle.load(open(stat_path, 'rb')) + + N = len(config.labels) + sorted_indexes = stats['sorted_indexes_for_plot'] + sorted_labels = np.array(config.labels)[sorted_indexes] + audio_clips_per_class = stats['official_balanced_trainig_samples'] + stats['official_unbalanced_training_samples'] + audio_clips_per_class = audio_clips_per_class[sorted_indexes] + + (ax1a, ax2a, ax3a, ax4a, ax1b, ax2b, ax3b, ax4b) = prepare_plot_long_4_rows(sorted_labels) + + # plot the same data on both axes + ax1a.bar(np.arange(N), audio_clips_per_class, alpha=0.3) + ax2a.bar(np.arange(N), audio_clips_per_class, alpha=0.3) + ax3a.bar(np.arange(N), audio_clips_per_class, alpha=0.3) + 
ax4a.bar(np.arange(N), audio_clips_per_class, alpha=0.3) + + maps_avg_instances = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision'] + maps_avg_instances = maps_avg_instances[sorted_indexes] + + maps_cnn13 = stats['cnn13_system_iteration60k']['eval']['average_precision'] + maps_cnn13 = maps_cnn13[sorted_indexes] + + maps_mobilenetv1 = stats['mobilenetv1_system_iteration56k']['eval']['average_precision'] + maps_mobilenetv1 = maps_mobilenetv1[sorted_indexes] + + maps_logmel_wavegram_cnn = _load_metrics0_classwise('main', 32000, 1024, + 320, 64, 50, 14000, 'full_train', 'Cnn13_SpAndWav', 'clip_bce', 'balanced', 'mixup', 32) + maps_logmel_wavegram_cnn = maps_logmel_wavegram_cnn[sorted_indexes] + + _scatter_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, s=5, c='k') + _scatter_4_rows(maps_cnn13, ax1b, ax2b, ax3b, ax4b, s=5, c='r') + _scatter_4_rows(maps_mobilenetv1, ax1b, ax2b, ax3b, ax4b, s=5, c='b') + _scatter_4_rows(maps_logmel_wavegram_cnn, ax1b, ax2b, ax3b, ax4b, s=5, c='g') + + linewidth = 0.7 + line0te = _plot_4_rows(maps_avg_instances, ax1b, ax2b, ax3b, ax4b, c='k', linewidth=linewidth, label='AP with averaging instances (baseline)') + line1te = _plot_4_rows(maps_cnn13, ax1b, ax2b, ax3b, ax4b, c='r', linewidth=linewidth, label='AP with CNN14') + line2te = _plot_4_rows(maps_mobilenetv1, ax1b, ax2b, ax3b, ax4b, c='b', linewidth=linewidth, label='AP with MobileNetV1') + line3te = _plot_4_rows(maps_logmel_wavegram_cnn, ax1b, ax2b, ax3b, ax4b, c='g', linewidth=linewidth, label='AP with Wavegram-Logmel-CNN') + + label_quality = stats['label_quality'] + sorted_rate = np.array(label_quality)[sorted_indexes] + for k in range(len(sorted_rate)): + if sorted_rate[k] and sorted_rate[k] == 1: + sorted_rate[k] = 0.99 + + ax1b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+') + ax2b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+') + ax3b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+') + line_label_quality = ax4b.scatter(np.arange(N)[sorted_rate != None], sorted_rate[sorted_rate != None], s=12, c='r', linewidth=0.8, marker='+', label='Label quality') + ax1b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_') + ax2b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_') + ax3b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_') + ax4b.scatter(np.arange(N)[sorted_rate == None], 0.5 * np.ones(len(np.arange(N)[sorted_rate == None])), s=12, c='r', linewidth=0.8, marker='_') + + plt.legend(handles=[line0te, line1te, line2te, line3te, line_label_quality], fontsize=6, loc=1) + + plt.savefig(save_out_path) + print('Save fig to {}'.format(save_out_path)) + +def plot_flops(args): + + # Arguments & parameters + workspace = args.workspace + + # Paths + save_out_path = 'results_map/flops.pdf' + create_folder(os.path.dirname(save_out_path)) + + plt.figure(figsize=(5, 5)) + fig, ax = plt.subplots(1, 1) + + model_types = np.array(['Cnn6', 'Cnn10', 'Cnn14', 'ResNet22', 'ResNet38', 'ResNet54', + 'MobileNetV1', 'MobileNetV2', 'DaiNet', 'LeeNet', 'LeeNet18', + 'Res1dNet30', 'Res1dNet44', 'Wavegram-CNN', 
'Wavegram-\nLogmel-CNN']) + flops = np.array([21.986, 21.986, 42.220, 30.081, 48.962, 54.563, 3.614, 2.810, + 30.395, 4.741, 26.369, 32.688, 61.833, 44.234, 53.510]) + mAPs = np.array([0.343, 0.380, 0.431, 0.430, 0.434, 0.429, 0.389, 0.383, 0.295, + 0.266, 0.336, 0.365, 0.355, 0.389, 0.439]) + + sorted_indexes = np.sort(flops) + ax.scatter(flops, mAPs) + + shift = [[1, 0.002], [1, -0.006], [-1, -0.014], [-2, 0.006], [-7, 0.006], + [1, -0.01], [0.5, 0.004], [-1, -0.014], [1, -0.007], [0.8, -0.008], + [1, -0.007], [1, 0.002], [-6, -0.015], [1, -0.008], [0.8, 0]] + + for i, model_type in enumerate(model_types): + ax.annotate(model_type, (flops[i] + shift[i][0], mAPs[i] + shift[i][1])) + + ax.plot(flops[[0, 1, 2]], mAPs[[0, 1, 2]]) + ax.plot(flops[[3, 4, 5]], mAPs[[3, 4, 5]]) + ax.plot(flops[[6, 7]], mAPs[[6, 7]]) + ax.plot(flops[[9, 10]], mAPs[[9, 10]]) + ax.plot(flops[[11, 12]], mAPs[[11, 12]]) + ax.plot(flops[[13, 14]], mAPs[[13, 14]]) + + ax.set_xlim(0, 70) + ax.set_ylim(0.2, 0.5) + ax.set_xlabel('Multi-adds (million)') + ax.set_ylabel('mAP') + + plt.tight_layout(0, 0, 0) + + plt.savefig(save_out_path) + print('Write out figure to {}'.format(save_out_path)) + + +def spearman(args): + + # Arguments & parameters + workspace = args.workspace + + # Paths + stat_path = os.path.join(workspace, 'results', 'stats_for_paper.pkl') + + # Stats + stats = cPickle.load(open(stat_path, 'rb')) + + label_quality = np.array([qu if qu else 0.5 for qu in stats['label_quality']]) + training_samples = np.array(stats['official_balanced_trainig_samples']) + \ + np.array(stats['official_unbalanced_training_samples']) + mAP = stats['averaging_instance_system_avg_9_probs_from_10000_to_50000_iterations']['eval']['average_precision'] + + import scipy + samples_spearman = scipy.stats.spearmanr(training_samples, mAP)[0] + quality_spearman = scipy.stats.spearmanr(label_quality, mAP)[0] + + print('Training samples spearman: {:.3f}'.format(samples_spearman)) + print('Quality spearman: {:.3f}'.format(quality_spearman)) + + +def print_results(args): + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_mixup_time_domain', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'none', 'none', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'none', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'balanced_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + + # + (mAP, mAUC, dprime) = _load_metrics0_classwise2('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn13_emb32', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics0_classwise2('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn13_emb128', 'clip_bce', 'balanced', 'mixup', 32) + + # partial + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 
'partial_0.8_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'partial_0.5_full_train', 'Cnn14', 'clip_bce', 'balanced', 'mixup', 32) + + # Sample rate + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_16k', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 64, 50, 14000, 'full_train', 'Cnn14_8k', 'clip_bce', 'balanced', 'mixup', 32) + + # Mel bins + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 128, 50, 14000, 'full_train', 'Cnn14_mel128', 'clip_bce', 'balanced', 'mixup', 32) + + (mAP, mAUC, dprime) = _load_metrics_classwise('main', 32000, 1024, 320, 32, 50, 14000, 'full_train', 'Cnn14_mel32', 'clip_bce', 'balanced', 'mixup', 32) + + import crash + asdf + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='') + subparsers = parser.add_subparsers(dest='mode') + + parser_plot = subparsers.add_parser('plot') + parser_plot.add_argument('--dataset_dir', type=str, required=True) + parser_plot.add_argument('--workspace', type=str, required=True) + parser_plot.add_argument('--select', type=str, required=True) + + parser_plot = subparsers.add_parser('plot_for_paper') + parser_plot.add_argument('--dataset_dir', type=str, required=True) + parser_plot.add_argument('--workspace', type=str, required=True) + parser_plot.add_argument('--select', type=str, required=True) + + parser_plot = subparsers.add_parser('plot_for_paper2') + parser_plot.add_argument('--dataset_dir', type=str, required=True) + parser_plot.add_argument('--workspace', type=str, required=True) + + parser_values = subparsers.add_parser('plot_class_iteration') + parser_values.add_argument('--workspace', type=str, required=True) + parser_values.add_argument('--select', type=str, required=True) + + parser_summary_stats = subparsers.add_parser('summary_stats') + parser_summary_stats.add_argument('--workspace', type=str, required=True) + + parser_plot_long = subparsers.add_parser('plot_long_fig') + parser_plot_long.add_argument('--workspace', type=str, required=True) + + parser_plot_flops = subparsers.add_parser('plot_flops') + parser_plot_flops.add_argument('--workspace', type=str, required=True) + + parser_spearman = subparsers.add_parser('spearman') + parser_spearman.add_argument('--workspace', type=str, required=True) + + parser_print = subparsers.add_parser('print') + parser_print.add_argument('--workspace', type=str, required=True) + + args = parser.parse_args() + + if args.mode == 'plot': + plot(args) + + elif args.mode == 'plot_for_paper': + plot_for_paper(args) + + elif args.mode == 'plot_for_paper2': + plot_for_paper2(args) + + elif args.mode == 'table_values': + table_values(args) + + elif args.mode == 'plot_class_iteration': + plot_class_iteration(args) + + elif args.mode == 'summary_stats': + summary_stats(args) + + elif args.mode == 'plot_long_fig': + plot_long_fig(args) + + elif args.mode == 'plot_flops': + plot_flops(args) + + elif args.mode == 'spearman': + spearman(args) + + elif args.mode == 'print': + print_results(args) + + else: + raise Exception('Error argument!') \ No newline at end of file diff --git a/audio_detection/audio_infer/utils/utilities.py b/audio_detection/audio_infer/utils/utilities.py new file mode 100644 index 0000000..8d16045 --- /dev/null +++ b/audio_detection/audio_infer/utils/utilities.py @@ -0,0 +1,172 @@ +import os +import 
logging +import h5py +import soundfile +import librosa +import numpy as np +import pandas as pd +from scipy import stats +import datetime +import pickle + + +def create_folder(fd): + if not os.path.exists(fd): + os.makedirs(fd) + + +def get_filename(path): + path = os.path.realpath(path) + na_ext = path.split('/')[-1] + na = os.path.splitext(na_ext)[0] + return na + + +def get_sub_filepaths(folder): + paths = [] + for root, dirs, files in os.walk(folder): + for name in files: + path = os.path.join(root, name) + paths.append(path) + return paths + + +def create_logging(log_dir, filemode): + create_folder(log_dir) + i1 = 0 + + while os.path.isfile(os.path.join(log_dir, '{:04d}.log'.format(i1))): + i1 += 1 + + log_path = os.path.join(log_dir, '{:04d}.log'.format(i1)) + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', + datefmt='%a, %d %b %Y %H:%M:%S', + filename=log_path, + filemode=filemode) + + # Print to console + console = logging.StreamHandler() + console.setLevel(logging.INFO) + formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') + console.setFormatter(formatter) + logging.getLogger('').addHandler(console) + + return logging + + +def read_metadata(csv_path, classes_num, id_to_ix): + """Read metadata of AudioSet from a csv file. + + Args: + csv_path: str + + Returns: + meta_dict: {'audio_name': (audios_num,), 'target': (audios_num, classes_num)} + """ + + with open(csv_path, 'r') as fr: + lines = fr.readlines() + lines = lines[3:] # Remove heads + + audios_num = len(lines) + targets = np.zeros((audios_num, classes_num), dtype=np.bool) + audio_names = [] + + for n, line in enumerate(lines): + items = line.split(', ') + """items: ['--4gqARaEJE', '0.000', '10.000', '"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"\n']""" + + audio_name = 'Y{}.wav'.format(items[0]) # Audios are started with an extra 'Y' when downloading + label_ids = items[3].split('"')[1].split(',') + + audio_names.append(audio_name) + + # Target + for id in label_ids: + ix = id_to_ix[id] + targets[n, ix] = 1 + + meta_dict = {'audio_name': np.array(audio_names), 'target': targets} + return meta_dict + + +def float32_to_int16(x): + assert np.max(np.abs(x)) <= 1.2 + x = np.clip(x, -1, 1) + return (x * 32767.).astype(np.int16) + +def int16_to_float32(x): + return (x / 32767.).astype(np.float32) + + +def pad_or_truncate(x, audio_length): + """Pad all audio to specific length.""" + if len(x) <= audio_length: + return np.concatenate((x, np.zeros(audio_length - len(x))), axis=0) + else: + return x[0 : audio_length] + + +def d_prime(auc): + d_prime = stats.norm().ppf(auc) * np.sqrt(2.0) + return d_prime + + +class Mixup(object): + def __init__(self, mixup_alpha, random_seed=1234): + """Mixup coefficient generator. + """ + self.mixup_alpha = mixup_alpha + self.random_state = np.random.RandomState(random_seed) + + def get_lambda(self, batch_size): + """Get mixup random coefficients. + Args: + batch_size: int + Returns: + mixup_lambdas: (batch_size,) + """ + mixup_lambdas = [] + for n in range(0, batch_size, 2): + lam = self.random_state.beta(self.mixup_alpha, self.mixup_alpha, 1)[0] + mixup_lambdas.append(lam) + mixup_lambdas.append(1. - lam) + + return np.array(mixup_lambdas) + + +class StatisticsContainer(object): + def __init__(self, statistics_path): + """Contain statistics of different training iterations. 
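+        Internally the statistics are kept as {'bal': [...], 'test': [...]} (balanced-train and evaluation splits); append() stamps each entry with its 'iteration' before storing it under the given data_type.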
+ """ + self.statistics_path = statistics_path + + self.backup_statistics_path = '{}_{}.pkl'.format( + os.path.splitext(self.statistics_path)[0], + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + + self.statistics_dict = {'bal': [], 'test': []} + + def append(self, iteration, statistics, data_type): + statistics['iteration'] = iteration + self.statistics_dict[data_type].append(statistics) + + def dump(self): + pickle.dump(self.statistics_dict, open(self.statistics_path, 'wb')) + pickle.dump(self.statistics_dict, open(self.backup_statistics_path, 'wb')) + logging.info(' Dump statistics to {}'.format(self.statistics_path)) + logging.info(' Dump statistics to {}'.format(self.backup_statistics_path)) + + def load_state_dict(self, resume_iteration): + self.statistics_dict = pickle.load(open(self.statistics_path, 'rb')) + + resume_statistics_dict = {'bal': [], 'test': []} + + for key in self.statistics_dict.keys(): + for statistics in self.statistics_dict[key]: + if statistics['iteration'] <= resume_iteration: + resume_statistics_dict[key].append(statistics) + + self.statistics_dict = resume_statistics_dict \ No newline at end of file diff --git a/audio_detection/target_sound_detection/src/models.py b/audio_detection/target_sound_detection/src/models.py new file mode 100644 index 0000000..eeeec40 --- /dev/null +++ b/audio_detection/target_sound_detection/src/models.py @@ -0,0 +1,1292 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2021/3/9 16:33 +# @Author : dongchao yang +# @File : train.py +from itertools import zip_longest +import numpy as np +from scipy import ndimage +import torch +import torch.nn as nn +import torch.nn.functional as F +import time +from torchlibrosa.augmentation import SpecAugmentation +from torchlibrosa.stft import Spectrogram, LogmelFilterBank +import math +from sklearn.cluster import KMeans +import os +import time +from functools import partial +# import timm +# from timm.models.layers import DropPath, to_2tuple, trunc_normal_ +import warnings +from functools import partial +# from timm.models.registry import register_model +# from timm.models.vision_transformer import _cfg +# from mmdet.utils import get_root_logger +# from mmcv.runner import load_checkpoint +# from mmcv.runner import _load_checkpoint, load_state_dict +# import mmcv.runner +import copy +from collections import OrderedDict +import io +import re +DEBUG=0 +event_labels = ['Alarm', 'Alarm_clock', 'Animal', 'Applause', 'Arrow', 'Artillery_fire', + 'Babbling', 'Baby_laughter', 'Bark', 'Basketball_bounce', 'Battle_cry', + 'Bell', 'Bird', 'Bleat', 'Bouncing', 'Breathing', 'Buzz', 'Camera', + 'Cap_gun', 'Car', 'Car_alarm', 'Cat', 'Caw', 'Cheering', 'Child_singing', + 'Choir', 'Chop', 'Chopping_(food)', 'Clapping', 'Clickety-clack', 'Clicking', + 'Clip-clop', 'Cluck', 'Coin_(dropping)', 'Computer_keyboard', 'Conversation', + 'Coo', 'Cough', 'Cowbell', 'Creak', 'Cricket', 'Croak', 'Crow', 'Crowd', 'DTMF', + 'Dog', 'Door', 'Drill', 'Drip', 'Engine', 'Engine_starting', 'Explosion', 'Fart', + 'Female_singing', 'Filing_(rasp)', 'Finger_snapping', 'Fire', 'Fire_alarm', 'Firecracker', + 'Fireworks', 'Frog', 'Gasp', 'Gears', 'Giggle', 'Glass', 'Glass_shatter', 'Gobble', 'Groan', + 'Growling', 'Hammer', 'Hands', 'Hiccup', 'Honk', 'Hoot', 'Howl', 'Human_sounds', 'Human_voice', + 'Insect', 'Laughter', 'Liquid', 'Machine_gun', 'Male_singing', 'Mechanisms', 'Meow', 'Moo', + 'Motorcycle', 'Mouse', 'Music', 'Oink', 'Owl', 'Pant', 'Pant_(dog)', 'Patter', 'Pig', 'Plop', + 'Pour', 'Power_tool', 'Purr', 
'Quack', 'Radio', 'Rain_on_surface', 'Rapping', 'Rattle', + 'Reversing_beeps', 'Ringtone', 'Roar', 'Run', 'Rustle', 'Scissors', 'Scrape', 'Scratch', + 'Screaming', 'Sewing_machine', 'Shout', 'Shuffle', 'Shuffling_cards', 'Singing', + 'Single-lens_reflex_camera', 'Siren', 'Skateboard', 'Sniff', 'Snoring', 'Speech', + 'Speech_synthesizer', 'Spray', 'Squeak', 'Squeal', 'Steam', 'Stir', 'Surface_contact', + 'Tap', 'Tap_dance', 'Telephone_bell_ringing', 'Television', 'Tick', 'Tick-tock', 'Tools', + 'Train', 'Train_horn', 'Train_wheels_squealing', 'Truck', 'Turkey', 'Typewriter', 'Typing', + 'Vehicle', 'Video_game_sound', 'Water', 'Whimper_(dog)', 'Whip', 'Whispering', 'Whistle', + 'Whistling', 'Whoop', 'Wind', 'Writing', 'Yip', 'and_pans', 'bird_song', 'bleep', 'clink', + 'cock-a-doodle-doo', 'crinkling', 'dove', 'dribble', 'eructation', 'faucet', 'flapping_wings', + 'footsteps', 'gunfire', 'heartbeat', 'infant_cry', 'kid_speaking', 'man_speaking', 'mastication', + 'mice', 'river', 'rooster', 'silverware', 'skidding', 'smack', 'sobbing', 'speedboat', 'splatter', + 'surf', 'thud', 'thwack', 'toot', 'truck_horn', 'tweet', 'vroom', 'waterfowl', 'woman_speaking'] +def load_checkpoint(model, + filename, + map_location=None, + strict=False, + logger=None, + revise_keys=[(r'^module\.', '')]): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + revise_keys (list): A list of customized keywords to modify the + state_dict in checkpoint. Each item is a (pattern, replacement) + pair of the regular expression operations. Default: strip + the prefix 'module.' by [(r'^module\\.', '')]. + + Returns: + dict or OrderedDict: The loaded checkpoint. 
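+ + Example (illustrative call; the checkpoint path is a placeholder, not a file shipped with the repo): + ckpt = load_checkpoint(model, 'path/to/pvt_checkpoint.pth', map_location='cpu', strict=False)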
+ """ + + checkpoint = _load_checkpoint(filename, map_location, logger) + ''' + new_proj = torch.nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=2).unsqueeze(2).repeat(1,1,3,1)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=3).unsqueeze(3).repeat(1,1,1,3)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + ''' + new_proj = torch.nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2)) + new_proj.weight = torch.nn.Parameter(torch.sum(checkpoint['patch_embed1.proj.weight'], dim=1).unsqueeze(1)) + checkpoint['patch_embed1.proj.weight'] = new_proj.weight + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + # strip prefix of state_dict + metadata = getattr(state_dict, '_metadata', OrderedDict()) + for p, r in revise_keys: + state_dict = OrderedDict( + {re.sub(p, r, k): v + for k, v in state_dict.items()}) + state_dict = OrderedDict({k.replace('backbone.',''):v for k,v in state_dict.items()}) + # Keep metadata in state_dict + state_dict._metadata = metadata + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + +def init_weights(m): + if isinstance(m, (nn.Conv2d, nn.Conv1d)): + nn.init.kaiming_normal_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + if isinstance(m, nn.Linear): + nn.init.kaiming_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) +def init_layer(layer): + """Initialize a Linear or Convolutional layer. """ + nn.init.xavier_uniform_(layer.weight) + if hasattr(layer, 'bias'): + if layer.bias is not None: + layer.bias.data.fill_(0.) + + +def init_bn(bn): + """Initialize a Batchnorm layer. """ + bn.bias.data.fill_(0.) + bn.weight.data.fill_(1.) + +class MaxPool(nn.Module): + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, decision): + return torch.max(decision, dim=self.pooldim)[0] + + +class LinearSoftPool(nn.Module): + """LinearSoftPool + Linear softmax, takes logits and returns a probability, near to the actual maximum value. 
+ Taken from the paper: + A Comparison of Five Multiple Instance Learning Pooling Functions for Sound Event Detection with Weak Labeling + https://arxiv.org/abs/1810.09050 + + """ + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, time_decision): + return (time_decision**2).sum(self.pooldim) / (time_decision.sum( + self.pooldim)+1e-7) + +class ConvBlock(nn.Module): + def __init__(self, in_channels, out_channels): + + super(ConvBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), stride=(1, 1), + padding=(1, 1), bias=False) + + self.conv2 = nn.Conv2d(in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), stride=(1, 1), + padding=(1, 1), bias=False) + + self.bn1 = nn.BatchNorm2d(out_channels) + self.bn2 = nn.BatchNorm2d(out_channels) + + self.init_weight() + + def init_weight(self): + init_layer(self.conv1) + init_layer(self.conv2) + init_bn(self.bn1) + init_bn(self.bn2) + + + def forward(self, input, pool_size=(2, 2), pool_type='avg'): + + x = input + x = F.relu_(self.bn1(self.conv1(x))) + x = F.relu_(self.bn2(self.conv2(x))) + if pool_type == 'max': + x = F.max_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg': + x = F.avg_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg+max': + x1 = F.avg_pool2d(x, kernel_size=pool_size) + x2 = F.max_pool2d(x, kernel_size=pool_size) + x = x1 + x2 + else: + raise Exception('Incorrect argument!') + + return x + +class ConvBlock_GLU(nn.Module): + def __init__(self, in_channels, out_channels,kernel_size=(3,3)): + super(ConvBlock_GLU, self).__init__() + self.conv1 = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, stride=(1, 1), + padding=(1, 1), bias=False) + self.bn1 = nn.BatchNorm2d(out_channels) + self.sigmoid = nn.Sigmoid() + self.init_weight() + + def init_weight(self): + init_layer(self.conv1) + init_bn(self.bn1) + + def forward(self, input, pool_size=(2, 2), pool_type='avg'): + x = input + x = self.bn1(self.conv1(x)) + cnn1 = self.sigmoid(x[:, :x.shape[1]//2, :, :]) + cnn2 = x[:,x.shape[1]//2:,:,:] + x = cnn1*cnn2 + if pool_type == 'max': + x = F.max_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg': + x = F.avg_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg+max': + x1 = F.avg_pool2d(x, kernel_size=pool_size) + x2 = F.max_pool2d(x, kernel_size=pool_size) + x = x1 + x2 + elif pool_type == 'None': + pass + elif pool_type == 'LP': + pass + #nn.LPPool2d(4, pool_size) + else: + raise Exception('Incorrect argument!') + return x + +class Mul_scale_GLU(nn.Module): + def __init__(self): + super(Mul_scale_GLU,self).__init__() + self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1)) # 1*1 + self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3)) # 3*3 + self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5)) # 5*5 + self.conv_block2 = ConvBlock_GLU(in_channels=96, out_channels=128*2) + # self.conv_block3 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock_GLU(in_channels=128, out_channels=128*2) + self.conv_block4 = ConvBlock_GLU(in_channels=128, out_channels=256*2) + self.conv_block5 = ConvBlock_GLU(in_channels=256, out_channels=256*2) + self.conv_block6 = ConvBlock_GLU(in_channels=256, out_channels=512*2) + self.conv_block7 = ConvBlock_GLU(in_channels=512, out_channels=512*2) + self.padding = 
nn.ReplicationPad2d((0,1,0,1)) + + def forward(self, input, fi=None): + """ + Input: (batch_size, data_length)""" + x1 = self.conv_block1_1(input, pool_size=(2, 2), pool_type='avg') + x1 = x1[:,:,:500,:32] + #print('x1 ',x1.shape) + x2 = self.conv_block1_2(input,pool_size=(2,2),pool_type='avg') + #print('x2 ',x2.shape) + x3 = self.conv_block1_3(input,pool_size=(2,2),pool_type='avg') + x3 = self.padding(x3) + #print('x3 ',x3.shape) + # assert 1==2 + x = torch.cat([x1,x2],dim=1) + x = torch.cat([x,x3],dim=1) + #print('x ',x.shape) + x = self.conv_block2(x, pool_size=(2, 2), pool_type='None') + x = self.conv_block3(x,pool_size=(2,2),pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) # + #print('x2,3 ',x.shape) + x = self.conv_block4(x, pool_size=(2, 4), pool_type='None') + x = self.conv_block5(x,pool_size=(2,4),pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + #print('x4,5 ',x.shape) + + x = self.conv_block6(x, pool_size=(1, 4), pool_type='None') + x = self.conv_block7(x, pool_size=(1, 4), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + # print('x6,7 ',x.shape) + # assert 1==2 + return x + +class Cnn14(nn.Module): + def __init__(self, sample_rate=32000, window_size=1024, hop_size=320, mel_bins=64, fmin=50, + fmax=14000, classes_num=527): + + super(Cnn14, self).__init__() + + window = 'hann' + center = True + pad_mode = 'reflect' + ref = 1.0 + amin = 1e-10 + top_db = None + + # Spectrogram extractor + self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, + win_length=window_size, window=window, center=center, pad_mode=pad_mode, + freeze_parameters=True) + + # Logmel feature extractor + self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, + n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, + freeze_parameters=True) + + # Spec augmenter + self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, + freq_drop_width=8, freq_stripes_num=2) + + self.bn0 = nn.BatchNorm2d(64) + + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024) + self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048) + + self.fc1 = nn.Linear(2048, 128, bias=True) + self.fc_audioset = nn.Linear(128, classes_num, bias=True) + + self.init_weight() + + def init_weight(self): + init_layer(self.fc1) + init_layer(self.fc_audioset) + + def forward(self, input_, mixup_lambda=None): + """ + Input: (batch_size, data_length)""" + input_ = input_.unsqueeze(1) + x = self.conv_block1(input_, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block4(x, pool_size=(1, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block5(x, pool_size=(1, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block6(x, pool_size=(1, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + # print(x.shape) + # x = torch.mean(x, dim=3) + x = x.transpose(1, 2).contiguous().flatten(-2) + x = 
self.fc1(x) + # print(x.shape) + # assert 1==2 + # (x1,_) = torch.max(x, dim=2) + # x2 = torch.mean(x, dim=2) + # x = x1 + x2 + # x = F.dropout(x, p=0.5, training=self.training) + # x = F.relu_(self.fc1(x)) + # embedding = F.dropout(x, p=0.5, training=self.training) + return x + +class Cnn10_fi(nn.Module): + def __init__(self): + super(Cnn10_fi, self).__init__() + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + + # self.fc1 = nn.Linear(512, 512, bias=True) + # self.fc_audioset = nn.Linear(512, classes_num, bias=True) + + # self.init_weight() + + def forward(self, input, fi=None): + """ + Input: (batch_size, data_length)""" + + x = self.conv_block1(input, pool_size=(2, 2), pool_type='avg') + if fi != None: + gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + x = (gamma)*x + beta + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + if fi != None: + gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + x = (gamma)*x + beta + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block3(x, pool_size=(2, 4), pool_type='avg') + if fi != None: + gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + x = (gamma)*x + beta + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block4(x, pool_size=(1, 4), pool_type='avg') + if fi != None: + gamma = fi[:,0].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + beta = fi[:,1].unsqueeze(1).unsqueeze(2).unsqueeze(3).expand_as(x) + x = (gamma)*x + beta + x = F.dropout(x, p=0.2, training=self.training) + return x + +class Cnn10_mul_scale(nn.Module): + def __init__(self,scale=8): + super(Cnn10_mul_scale, self).__init__() + self.conv_block1_1 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(1,1)) + self.conv_block1_2 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(3,3)) + self.conv_block1_3 = ConvBlock_GLU(in_channels=1, out_channels=64,kernel_size=(5,5)) + self.conv_block2 = ConvBlock(in_channels=96, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.scale = scale + self.padding = nn.ReplicationPad2d((0,1,0,1)) + def forward(self, input, pool_size=(2, 2), pool_type='avg'): + """ + Input: (batch_size, data_length)""" + if self.scale == 8: + pool_size1 = (2,2) + pool_size2 = (2,2) + pool_size3 = (2,4) + pool_size4 = (1,4) + elif self.scale == 4: + pool_size1 = (2,2) + pool_size2 = (2,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + elif self.scale == 2: + pool_size1 = (2,2) + pool_size2 = (1,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + else: + pool_size1 = (1,2) + pool_size2 = (1,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + # print('input ',input.shape) + x1 = self.conv_block1_1(input, pool_size=pool_size1, pool_type='avg') + x1 = x1[:,:,:500,:32] + #print('x1 ',x1.shape) + x2 = self.conv_block1_2(input, pool_size=pool_size1, pool_type='avg') + #print('x2 ',x2.shape) + x3 = self.conv_block1_3(input, pool_size=pool_size1, pool_type='avg') + x3 = self.padding(x3) + #print('x3 ',x3.shape) + # 
assert 1==2 + m_i = min(x3.shape[2],min(x1.shape[2],x2.shape[2])) + #print('m_i ', m_i) + x = torch.cat([x1[:,:,:m_i,:],x2[:,:, :m_i,:],x3[:,:, :m_i,:]],dim=1) + # x = torch.cat([x,x3],dim=1) + + # x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + return x + + +class Cnn10(nn.Module): + def __init__(self,scale=8): + super(Cnn10, self).__init__() + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.scale = scale + def forward(self, input, pool_size=(2, 2), pool_type='avg'): + """ + Input: (batch_size, data_length)""" + if self.scale == 8: + pool_size1 = (2,2) + pool_size2 = (2,2) + pool_size3 = (2,4) + pool_size4 = (1,4) + elif self.scale == 4: + pool_size1 = (2,2) + pool_size2 = (2,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + elif self.scale == 2: + pool_size1 = (2,2) + pool_size2 = (1,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + else: + pool_size1 = (1,2) + pool_size2 = (1,2) + pool_size3 = (1,4) + pool_size4 = (1,4) + x = self.conv_block1(input, pool_size=pool_size1, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block2(x, pool_size=pool_size2, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block3(x, pool_size=pool_size3, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + x = self.conv_block4(x, pool_size=pool_size4, pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + return x + +class MeanPool(nn.Module): + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + + def forward(self, logits, decision): + return torch.mean(decision, dim=self.pooldim) + +class ResPool(nn.Module): + def __init__(self, pooldim=1): + super().__init__() + self.pooldim = pooldim + self.linPool = LinearSoftPool(pooldim=1) + +class AutoExpPool(nn.Module): + def __init__(self, outputdim=10, pooldim=1): + super().__init__() + self.outputdim = outputdim + self.alpha = nn.Parameter(torch.full((outputdim, ), 1)) + self.pooldim = pooldim + + def forward(self, logits, decision): + scaled = self.alpha * decision # \alpha * P(Y|x) in the paper + return (logits * torch.exp(scaled)).sum( + self.pooldim) / torch.exp(scaled).sum(self.pooldim) + + +class SoftPool(nn.Module): + def __init__(self, T=1, pooldim=1): + super().__init__() + self.pooldim = pooldim + self.T = T + + def forward(self, logits, decision): + w = torch.softmax(decision / self.T, dim=self.pooldim) + return torch.sum(decision * w, dim=self.pooldim) + + +class AutoPool(nn.Module): + """docstring for AutoPool""" + def __init__(self, outputdim=10, pooldim=1): + super().__init__() + self.outputdim = outputdim + self.alpha = nn.Parameter(torch.ones(outputdim)) + self.dim = pooldim + + def forward(self, logits, decision): + scaled = self.alpha * decision # \alpha * P(Y|x) in the paper + weight = torch.softmax(scaled, dim=self.dim) + return torch.sum(decision * weight, dim=self.dim) # B x C + + +class 
ExtAttentionPool(nn.Module): + def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs): + super().__init__() + self.inputdim = inputdim + self.outputdim = outputdim + self.pooldim = pooldim + self.attention = nn.Linear(inputdim, outputdim) + nn.init.zeros_(self.attention.weight) + nn.init.zeros_(self.attention.bias) + self.activ = nn.Softmax(dim=self.pooldim) + + def forward(self, logits, decision): + # Logits of shape (B, T, D), decision of shape (B, T, C) + w_x = self.activ(self.attention(logits) / self.outputdim) + h = (logits.permute(0, 2, 1).contiguous().unsqueeze(-2) * + w_x.unsqueeze(-1)).flatten(-2).contiguous() + return torch.sum(h, self.pooldim) + + +class AttentionPool(nn.Module): + """docstring for AttentionPool""" + def __init__(self, inputdim, outputdim=10, pooldim=1, **kwargs): + super().__init__() + self.inputdim = inputdim + self.outputdim = outputdim + self.pooldim = pooldim + self.transform = nn.Linear(inputdim, outputdim) + self.activ = nn.Softmax(dim=self.pooldim) + self.eps = 1e-7 + + def forward(self, logits, decision): + # Input is (B, T, D) + # B, T , D + w = self.activ(torch.clamp(self.transform(logits), -15, 15)) + detect = (decision * w).sum( + self.pooldim) / (w.sum(self.pooldim) + self.eps) + # B, T, D + return detect + +class Block2D(nn.Module): + def __init__(self, cin, cout, kernel_size=3, padding=1): + super().__init__() + self.block = nn.Sequential( + nn.BatchNorm2d(cin), + nn.Conv2d(cin, + cout, + kernel_size=kernel_size, + padding=padding, + bias=False), + nn.LeakyReLU(inplace=True, negative_slope=0.1)) + + def forward(self, x): + return self.block(x) + +class AudioCNN(nn.Module): + def __init__(self, classes_num): + super(AudioCNN, self).__init__() + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.fc1 = nn.Linear(512,128,bias=True) + self.fc = nn.Linear(128, classes_num, bias=True) + self.init_weights() + + def init_weights(self): + init_layer(self.fc) + + def forward(self, input): + ''' + Input: (batch_size, times_steps, freq_bins)''' + # [128, 801, 168] --> [128,1,801,168] + x = input[:, None, :, :] + '''(batch_size, 1, times_steps, freq_bins)''' + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') # 128,64,400,84 + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') # 128,128,200,42 + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') # 128,256,100,21 + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') # 128,512,50,10 + '''(batch_size, feature_maps, time_steps, freq_bins)''' + x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) # 128,512,50 + (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) 128,512 + x = self.fc1(x) # 128,128 + output = self.fc(x) # 128,10 + return x,output + + def extract(self,input): + '''Input: (batch_size, times_steps, freq_bins)''' + x = input[:, None, :, :] + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + '''(batch_size, feature_maps, time_steps, freq_bins)''' + x = torch.mean(x, dim=3) # (batch_size, feature_maps, time_stpes) + (x, _) = torch.max(x, dim=2) # (batch_size, feature_maps) + x = self.fc1(x) # 128,128 + return x + +def 
parse_poolingfunction(poolingfunction_name='mean', **kwargs): + """parse_poolingfunction + A heler function to parse any temporal pooling + Pooling is done on dimension 1 + + :param poolingfunction_name: + :param **kwargs: + """ + poolingfunction_name = poolingfunction_name.lower() + if poolingfunction_name == 'mean': + return MeanPool(pooldim=1) + elif poolingfunction_name == 'max': + return MaxPool(pooldim=1) + elif poolingfunction_name == 'linear': + return LinearSoftPool(pooldim=1) + elif poolingfunction_name == 'expalpha': + return AutoExpPool(outputdim=kwargs['outputdim'], pooldim=1) + + elif poolingfunction_name == 'soft': + return SoftPool(pooldim=1) + elif poolingfunction_name == 'auto': + return AutoPool(outputdim=kwargs['outputdim']) + elif poolingfunction_name == 'attention': + return AttentionPool(inputdim=kwargs['inputdim'], + outputdim=kwargs['outputdim']) +class conv1d(nn.Module): + def __init__(self, nin, nout, kernel_size=3, stride=1, padding='VALID', dilation=1): + super(conv1d, self).__init__() + if padding == 'VALID': + dconv_pad = 0 + elif padding == 'SAME': + dconv_pad = dilation * ((kernel_size - 1) // 2) + else: + raise ValueError("Padding Mode Error!") + self.conv = nn.Conv1d(nin, nout, kernel_size=kernel_size, stride=stride, padding=dconv_pad) + self.act = nn.ReLU() + self.init_layer(self.conv) + + def init_layer(self, layer, nonlinearity='relu'): + """Initialize a Linear or Convolutional layer. """ + nn.init.kaiming_normal_(layer.weight, nonlinearity=nonlinearity) + nn.init.constant_(layer.bias, 0.1) + + def forward(self, x): + out = self.act(self.conv(x)) + return out + +class Atten_1(nn.Module): + def __init__(self, input_dim, context=2, dropout_rate=0.2): + super(Atten_1, self).__init__() + self._matrix_k = nn.Linear(input_dim, input_dim // 4) + self._matrix_q = nn.Linear(input_dim, input_dim // 4) + self.relu = nn.ReLU() + self.context = context + self._dropout_layer = nn.Dropout(dropout_rate) + self.init_layer(self._matrix_k) + self.init_layer(self._matrix_q) + + def init_layer(self, layer, nonlinearity='leaky_relu'): + """Initialize a Linear or Convolutional layer. """ + nn.init.kaiming_uniform_(layer.weight, nonlinearity=nonlinearity) + if hasattr(layer, 'bias'): + if layer.bias is not None: + layer.bias.data.fill_(0.) 
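+ + # Note on forward() below: the frame at index `context` acts as the query and all frames act as keys; scaled dot-product scores (softmax over time) reweight input_x, the weighted frames are averaged over time, and the result is added back to the context frame as a residual.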
+ + def forward(self, input_x): + k_x = input_x + k_x = self.relu(self._matrix_k(k_x)) + k_x = self._dropout_layer(k_x) + # print('k_x ',k_x.shape) + q_x = input_x[:, self.context, :] + # print('q_x ',q_x.shape) + q_x = q_x[:, None, :] + # print('q_x1 ',q_x.shape) + q_x = self.relu(self._matrix_q(q_x)) + q_x = self._dropout_layer(q_x) + # print('q_x2 ',q_x.shape) + x_ = torch.matmul(k_x, q_x.transpose(-2, -1) / math.sqrt(k_x.size(-1))) + # print('x_ ',x_.shape) + x_ = x_.squeeze(2) + alpha = F.softmax(x_, dim=-1) + att_ = alpha + # print('alpha ',alpha) + alpha = alpha.unsqueeze(2).repeat(1,1,input_x.shape[2]) + # print('alpha ',alpha) + # alpha = alpha.view(alpha.size(0), alpha.size(1), alpha.size(2), 1) + out = alpha * input_x + # print('out ', out.shape) + # out = out.mean(2) + out = out.mean(1) + # print('out ',out.shape) + # assert 1==2 + #y = alpha * input_x + #return y, att_ + out = input_x[:, self.context, :] + out + return out + +class Fusion(nn.Module): + def __init__(self, inputdim, inputdim2, n_fac): + super().__init__() + self.fuse_layer1 = conv1d(inputdim, inputdim2*n_fac,1) + self.fuse_layer2 = conv1d(inputdim2, inputdim2*n_fac,1) + self.avg_pool = nn.AvgPool1d(n_fac, stride=n_fac) # 沿着最后一个维度进行pooling + + def forward(self,embedding,mix_embed): + embedding = embedding.permute(0,2,1) + fuse1_out = self.fuse_layer1(embedding) # [2, 501, 2560] ,512*5, 1D卷积融合,spk_embeding ,扩大其维度 + fuse1_out = fuse1_out.permute(0,2,1) + + mix_embed = mix_embed.permute(0,2,1) + fuse2_out = self.fuse_layer2(mix_embed) # [2, 501, 2560] ,512*5, 1D卷积融合,spk_embeding ,扩大其维度 + fuse2_out = fuse2_out.permute(0,2,1) + as_embs = torch.mul(fuse1_out, fuse2_out) # 相乘 [2, 501, 2560] + # (10, 501, 512) + as_embs = self.avg_pool(as_embs) # [2, 501, 512] 相当于 2560//5 + return as_embs + +class CDur_fusion(nn.Module): + def __init__(self, inputdim, outputdim, **kwargs): + super().__init__() + self.features = nn.Sequential( + Block2D(1, 32), + nn.LPPool2d(4, (2, 4)), + Block2D(32, 128), + Block2D(128, 128), + nn.LPPool2d(4, (2, 4)), + Block2D(128, 128), + Block2D(128, 128), + nn.LPPool2d(4, (1, 4)), + nn.Dropout(0.3), + ) + with torch.no_grad(): + rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + + self.gru = nn.GRU(128, 128, bidirectional=True, batch_first=True) + self.fusion = Fusion(128,2) + self.fc = nn.Linear(256,256) + self.outputlayer = nn.Linear(256, outputdim) + self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding): # + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,128) + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = self.fusion(embedding,x) + #x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur(nn.Module): + def __init__(self, inputdim, outputdim,time_resolution, **kwargs): + super().__init__() + self.features = nn.Sequential( + 
Block2D(1, 32), + nn.LPPool2d(4, (2, 4)), + Block2D(32, 128), + Block2D(128, 128), + nn.LPPool2d(4, (2, 4)), + Block2D(128, 128), + Block2D(128, 128), + nn.LPPool2d(4, (2, 4)), + nn.Dropout(0.3), + ) + with torch.no_grad(): + rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + + self.gru = nn.GRU(256, 256, bidirectional=True, batch_first=True) + self.fc = nn.Linear(512,256) + self.outputlayer = nn.Linear(256, outputdim) + self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): # + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,128) + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_big(nn.Module): + def __init__(self, inputdim, outputdim, **kwargs): + super().__init__() + self.features = nn.Sequential( + Block2D(1, 64), + Block2D(64, 64), + nn.LPPool2d(4, (2, 2)), + Block2D(64, 128), + Block2D(128, 128), + nn.LPPool2d(4, (2, 2)), + Block2D(128, 256), + Block2D(256, 256), + nn.LPPool2d(4, (2, 4)), + Block2D(256, 512), + Block2D(512, 512), + nn.LPPool2d(4, (1, 4)), + nn.Dropout(0.3),) + with torch.no_grad(): + rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True) + self.fc = nn.Linear(1024,256) + self.outputlayer = nn.Linear(256, outputdim) + self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding): # + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_GLU(nn.Module): + def __init__(self, inputdim, outputdim, **kwargs): + super().__init__() + self.features = Mul_scale_GLU() + # with torch.no_grad(): + # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + self.gru = nn.GRU(640, 512,1, bidirectional=True, batch_first=True) # previous is 640 + # self.gru = LSTMModel(640, 512,1) + self.fc = nn.Linear(1024,256) + self.outputlayer = nn.Linear(256, outputdim) + # 
self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): # + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + # print('x ',x.shape) + # assert 1==2 + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + # x = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_CNN14(nn.Module): + def __init__(self, inputdim, outputdim,time_resolution,**kwargs): + super().__init__() + if time_resolution==125: + self.features = Cnn10(8) + elif time_resolution == 250: + #print('time_resolution ',time_resolution) + self.features = Cnn10(4) + elif time_resolution == 500: + self.features = Cnn10(2) + else: + self.features = Cnn10(0) + with torch.no_grad(): + rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + # self.features = Cnn10() + self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True) + # self.gru = LSTMModel(640, 512,1) + self.fc = nn.Linear(1024,256) + self.outputlayer = nn.Linear(256, outputdim) + # self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + # print('x ',x.shape) + # assert 1==2 + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + # x = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_CNN_mul_scale(nn.Module): + def __init__(self, inputdim, outputdim,time_resolution,**kwargs): + super().__init__() + if time_resolution==125: + self.features = Cnn10_mul_scale(8) + elif time_resolution == 250: + #print('time_resolution ',time_resolution) + self.features = Cnn10_mul_scale(4) + elif time_resolution == 500: + self.features = Cnn10_mul_scale(2) + else: + self.features = Cnn10_mul_scale(0) + # with torch.no_grad(): + # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + # self.features = Cnn10() + self.gru = nn.GRU(640, 512, bidirectional=True, batch_first=True) + # self.gru = LSTMModel(640, 512,1) + self.fc = nn.Linear(1024,256) + self.outputlayer = nn.Linear(256, outputdim) + # 
self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): + # print('x ',x.shape) + # assert 1==2 + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + # print('x ',x.shape) + # assert 1==2 + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + # x = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + +class CDur_CNN_mul_scale_fusion(nn.Module): + def __init__(self, inputdim, outputdim, time_resolution,**kwargs): + super().__init__() + if time_resolution==125: + self.features = Cnn10_mul_scale(8) + elif time_resolution == 250: + #print('time_resolution ',time_resolution) + self.features = Cnn10_mul_scale(4) + elif time_resolution == 500: + self.features = Cnn10_mul_scale(2) + else: + self.features = Cnn10_mul_scale(0) + # with torch.no_grad(): + # rnn_input_dim = self.features(torch.randn(1, 1, 500,inputdim)).shape + # rnn_input_dim = rnn_input_dim[1] * rnn_input_dim[-1] + # self.features = Cnn10() + self.gru = nn.GRU(512, 512, bidirectional=True, batch_first=True) + # self.gru = LSTMModel(640, 512,1) + self.fc = nn.Linear(1024,256) + self.fusion = Fusion(128,512,2) + self.outputlayer = nn.Linear(256, outputdim) + # self.features.apply(init_weights) + self.outputlayer.apply(init_weights) + + def forward(self, x, embedding,one_hot=None): + # print('x ',x.shape) + # assert 1==2 + batch, time, dim = x.shape + x = x.unsqueeze(1) # (b,1,t,d) + x = self.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,512) + # print('x ',x.shape) + # assert 1==2 + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + x = self.fusion(embedding, x) + #x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.gru.flatten_parameters() + x, _ = self.gru(x) # x torch.Size([16, 125, 256]) + # x = self.gru(x) # x torch.Size([16, 125, 256]) + x = self.fc(x) + decision_time = torch.softmax(self.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0],decision_up + + +class RaDur_fusion(nn.Module): + def __init__(self, model_config, inputdim, outputdim, time_resolution, **kwargs): + super().__init__() + self.encoder = Cnn14() + self.detection = CDur_CNN_mul_scale_fusion(inputdim, outputdim, time_resolution) + self.softmax = nn.Softmax(dim=2) + #self.temperature = 5 + if model_config['pre_train']: + self.encoder.load_state_dict(torch.load(model_config['encoder_path'])['model']) + self.detection.load_state_dict(torch.load(model_config['CDur_path'])) + + self.q = nn.Linear(128,128) + self.k = nn.Linear(128,128) + self.q_ee = nn.Linear(128, 128) + 
self.k_ee = nn.Linear(128, 128) + self.temperature = 11.3 # sqrt(128) + self.att_pool = model_config['att_pool'] + self.enhancement = model_config['enhancement'] + self.tao = model_config['tao'] + self.top = model_config['top'] + self.bn = nn.BatchNorm1d(128) + self.EE_fusion = Fusion(128, 128, 4) + + def get_w(self,q,k): + q = self.q(q) + k = self.k(k) + q = q.unsqueeze(1) + attn = torch.bmm(q, k.transpose(1, 2)) + attn = attn/self.temperature + attn = self.softmax(attn) + return attn + + def get_w_ee(self,q,k): + q = self.q_ee(q) + k = self.k_ee(k) + q = q.unsqueeze(1) + attn = torch.bmm(q, k.transpose(1, 2)) + attn = attn/self.temperature + attn = self.softmax(attn) + return attn + + def attention_pooling(self, embeddings, mean_embedding): + att_pool_w = self.get_w(mean_embedding,embeddings) + embedding = torch.bmm(att_pool_w, embeddings).squeeze(1) + # print(embedding.shape) + # print(att_pool_w.shape) + # print(att_pool_w[0]) + # assert 1==2 + return embedding + + def select_topk_embeddings(self, scores, embeddings, k): + _, idx_DESC = scores.sort(descending=True, dim=1) # 根据分数进行排序 + top_k = _[:,:k] + # print('top_k ', top_k) + # top_k = top_k.mean(1) + idx_topk = idx_DESC[:, :k] # 取top_k个 + # print('index ', idx_topk) + idx_topk = idx_topk.unsqueeze(2).expand([-1, -1, embeddings.shape[2]]) + selected_embeddings = torch.gather(embeddings, 1, idx_topk) + return selected_embeddings,top_k + + def sum_with_attention(self, embedding, top_k, selected_embeddings): + # print('embedding ',embedding) + # print('selected_embeddings ',selected_embeddings.shape) + att_1 = self.get_w_ee(embedding, selected_embeddings) + att_1 = att_1.squeeze(1) + #print('att_1 ',att_1.shape) + larger = top_k > self.tao + # print('larger ',larger) + top_k = top_k*larger + # print('top_k ',top_k.shape) + # print('top_k ',top_k) + att_1 = att_1*top_k + #print('att_1 ',att_1.shape) + # assert 1==2 + att_2 = att_1.unsqueeze(2).repeat(1,1,128) + Es = selected_embeddings*att_2 + return Es + + def orcal_EE(self, x, embedding, label): + batch, time, dim = x.shape + + mixture_embedding = self.encoder(x) # 8, 125, 128 + mixture_embedding = mixture_embedding.transpose(1,2) + mixture_embedding = self.bn(mixture_embedding) + mixture_embedding = mixture_embedding.transpose(1,2) + + x = x.unsqueeze(1) # (b,1,t,d) + x = self.detection.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,128) + embedding_pre = embedding.unsqueeze(1) + embedding_pre = embedding_pre.repeat(1, x.shape[1], 1) + f = self.detection.fusion(embedding_pre, x) # the first stage results + #f = torch.cat((x, embedding_pre), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.detection.gru.flatten_parameters() + f, _ = self.detection.gru(f) # x torch.Size([16, 125, 256]) + f = self.detection.fc(f) + decision_time = torch.softmax(self.detection.outputlayer(f),dim=2) # x torch.Size([16, 125, 2]) + + selected_embeddings, top_k = self.select_topk_embeddings(decision_time[:,:,0], mixture_embedding, self.top) + + selected_embeddings = self.sum_with_attention(embedding, top_k, selected_embeddings) # add the weight + + mix_embedding = selected_embeddings.mean(1).unsqueeze(1) # + mix_embedding = mix_embedding.repeat(1, x.shape[1], 1) + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + mix_embedding = self.EE_fusion(mix_embedding, embedding) # 使用神经网络进行融合 + # mix_embedding2 = selected_embeddings2.mean(1) + #mix_embedding = embedding + mix_embedding # 直接相加 + # new 
detection results + # embedding_now = mix_embedding.unsqueeze(1) + # embedding_now = embedding_now.repeat(1, x.shape[1], 1) + f_now = self.detection.fusion(mix_embedding, x) + #f_now = torch.cat((x, embedding_now), dim=2) # + f_now, _ = self.detection.gru(f_now) # x torch.Size([16, 125, 256]) + f_now = self.detection.fc(f_now) + decision_time_now = torch.softmax(self.detection.outputlayer(f_now), dim=2) # x torch.Size([16, 125, 2]) + + top_k = top_k.mean(1) # get avg score,higher score will have more weight + larger = top_k > self.tao + top_k = top_k * larger + top_k = top_k/2.0 + # print('top_k ',top_k) + # assert 1==2 + # print('tok_k[ ',top_k.shape) + # print('decision_time ',decision_time.shape) + # print('decision_time_now ',decision_time_now.shape) + neg_w = top_k.unsqueeze(1).unsqueeze(2) + neg_w = neg_w.repeat(1, decision_time_now.shape[1], decision_time_now.shape[2]) + # print('neg_w ',neg_w.shape) + #print('neg_w ',neg_w[:,0:10,0]) + pos_w = 1-neg_w + #print('pos_w ',pos_w[:,0:10,0]) + decision_time_final = decision_time*pos_w + neg_w*decision_time_now + #print('decision_time_final ',decision_time_final[0,0:10,0]) + # print(decision_time_final[0,:,:]) + #assert 1==2 + return decision_time_final + + def forward(self, x, ref, label=None): + batch, time, dim = x.shape + logit = torch.zeros(1).cuda() + embeddings = self.encoder(ref) + mean_embedding = embeddings.mean(1) + if self.att_pool == True: + mean_embedding = self.bn(mean_embedding) + embeddings = embeddings.transpose(1,2) + embeddings = self.bn(embeddings) + embeddings = embeddings.transpose(1,2) + embedding = self.attention_pooling(embeddings, mean_embedding) + else: + embedding = mean_embedding + if self.enhancement == True: + decision_time = self.orcal_EE(x, embedding, label) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), # [16, 2, 125] + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0], decision_up, logit + + x = x.unsqueeze(1) # (b,1,t,d) + x = self.detection.features(x) # + x = x.transpose(1, 2).contiguous().flatten(-2) # 重新拷贝一份x,之后推平-2:-1之间的维度 # (b,125,128) + embedding = embedding.unsqueeze(1) + embedding = embedding.repeat(1, x.shape[1], 1) + # x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + x = self.detection.fusion(embedding, x) + # embedding = embedding.unsqueeze(1) + # embedding = embedding.repeat(1, x.shape[1], 1) + # x = torch.cat((x, embedding), dim=2) # [B, T, 128 + emb_dim] + if not hasattr(self, '_flattened'): + self.detection.gru.flatten_parameters() + x, _ = self.detection.gru(x) # x torch.Size([16, 125, 256]) + x = self.detection.fc(x) + decision_time = torch.softmax(self.detection.outputlayer(x),dim=2) # x torch.Size([16, 125, 2]) + decision_up = torch.nn.functional.interpolate( + decision_time.transpose(1, 2), + time, # 501 + mode='linear', + align_corners=False).transpose(1, 2) # 从125插值回 501 ?--> (16,501,2) + return decision_time[:,:,0], decision_up, logit diff --git a/audio_detection/target_sound_detection/src/utils.py b/audio_detection/target_sound_detection/src/utils.py new file mode 100644 index 0000000..cf1deea --- /dev/null +++ b/audio_detection/target_sound_detection/src/utils.py @@ -0,0 +1,353 @@ +# !/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2021/3/9 16:33 +# @Author : dongchao yang +# @File : train.py + +import collections +import sys +from loguru import logger +from pprint import pformat + +import numpy as np +import pandas as pd +import scipy +import 
six +import sklearn.preprocessing as pre +import torch +import tqdm +import yaml + +from scipy.interpolate import interp1d + +def parse_config_or_kwargs(config_file, **kwargs): + """parse_config_or_kwargs + :param config_file: Config file that has parameters, yaml format + :param **kwargs: Other alternative parameters or overwrites for config + """ + with open(config_file) as con_read: + yaml_config = yaml.load(con_read, Loader=yaml.FullLoader) + arguments = dict(yaml_config, **kwargs) + return arguments + + +def find_contiguous_regions(activity_array): # if the XOR trick below is hard to follow, an equivalent O(n) scan over the array works just as well + """Find contiguous regions from bool valued numpy.array. + Copy of https://dcase-repo.github.io/dcase_util/_modules/dcase_util/data/decisions.html#DecisionEncoder + Reason is: + 1. This does not belong to a class necessarily + 2. Importing DecisionEncoder pulls in sndfile and some other imports, which causes problems on clusters + """ + change_indices = np.logical_xor(activity_array[1:], activity_array[:-1]).nonzero()[0] + change_indices += 1 + if activity_array[0]: + # If the first element of activity_array is True, add 0 at the beginning + change_indices = np.r_[0, change_indices] + + if activity_array[-1]: + # If the last element of activity_array is True, add the length of the array + change_indices = np.r_[change_indices, activity_array.size] + # print(change_indices.reshape((-1, 2))) + # Reshape the result into two columns + return change_indices.reshape((-1, 2)) + + +def split_train_cv( + data_frame: pd.DataFrame, + frac: float = 0.9, + y=None, # Only for stratified, computes necessary split + **kwargs): + """split_train_cv + + :param data_frame: + :type data_frame: pd.DataFrame + :param frac: + :type frac: float + """ + if kwargs.get('mode', + None) == 'urbansed': # Filenames are DATA_-1 DATA_-2 etc + data_frame.loc[:, 'id'] = data_frame.groupby( + data_frame['filename'].str.split('_').apply( + lambda x: '_'.join(x[:-1]))).ngroup() + sampler = np.random.permutation(data_frame['id'].nunique()) + num_train = int(frac * len(sampler)) + train_indexes = sampler[:num_train] + cv_indexes = sampler[num_train:] + train_data = data_frame[data_frame['id'].isin(train_indexes)] + cv_data = data_frame[data_frame['id'].isin(cv_indexes)] + del train_data['id'] + del cv_data['id'] + elif kwargs.get('mode', None) == 'stratified': # stratified sampling + # Use stratified sampling + from skmultilearn.model_selection import iterative_train_test_split + index_train, _, index_cv, _ = iterative_train_test_split( + data_frame.index.values.reshape(-1, 1), y, test_size=1.
- frac) + train_data = data_frame[data_frame.index.isin(index_train.squeeze())] + cv_data = data_frame[data_frame.index.isin(index_cv.squeeze())] # cv --> cross validation + else: + # Simply split train_test + train_data = data_frame.sample(frac=frac, random_state=10) + cv_data = data_frame[~data_frame.index.isin(train_data.index)] + return train_data, cv_data + + + +def pprint_dict(in_dict, outputfun=sys.stdout.write, formatter='yaml'): # print yaml file + """pprint_dict + :param outputfun: function to use, defaults to sys.stdout + :param in_dict: dict to print + """ + if formatter == 'yaml': + format_fun = yaml.dump + elif formatter == 'pretty': + format_fun = pformat + for line in format_fun(in_dict).split('\n'): + outputfun(line) + + +def getfile_outlogger(outputfile): + log_format = "[{time:YYYY-MM-DD HH:mm:ss}] {message}" + logger.configure(handlers=[{"sink": sys.stderr, "format": log_format}]) + if outputfile: + logger.add(outputfile, enqueue=True, format=log_format) + return logger + +# according label, get encoder +def train_labelencoder(labels: pd.Series, sparse=True): + """encode_labels + + Encodes labels + + :param labels: pd.Series representing the raw labels e.g., Speech, Water + :param encoder (optional): Encoder already fitted + returns encoded labels (many hot) and the encoder + """ + assert isinstance(labels, pd.Series), "Labels need to be series" + if isinstance(labels[0], six.string_types): + # In case of using non processed strings, e.g., Vaccum, Speech + label_array = labels.str.split(',').values.tolist() # split label according to ',' + elif isinstance(labels[0], np.ndarray): + # Encoder does not like to see numpy array + label_array = [lab.tolist() for lab in labels] + elif isinstance(labels[0], collections.Iterable): + label_array = labels + encoder = pre.MultiLabelBinarizer(sparse_output=sparse) + encoder.fit(label_array) + return encoder + + +def encode_labels(labels: pd.Series, encoder=None, sparse=True): + """encode_labels + + Encodes labels + + :param labels: pd.Series representing the raw labels e.g., Speech, Water + :param encoder (optional): Encoder already fitted + returns encoded labels (many hot) and the encoder + """ + assert isinstance(labels, pd.Series), "Labels need to be series" + instance = labels.iloc[0] + if isinstance(instance, six.string_types): + # In case of using non processed strings, e.g., Vaccum, Speech + label_array = labels.str.split(',').values.tolist() + elif isinstance(instance, np.ndarray): + # Encoder does not like to see numpy array + label_array = [lab.tolist() for lab in labels] + elif isinstance(instance, collections.Iterable): + label_array = labels + # get label_array, it is a list ,contain a lot of label, this label are string type + if not encoder: + encoder = pre.MultiLabelBinarizer(sparse_output=sparse) # if we encoder is None, we should init a encoder firstly. + encoder.fit(label_array) + labels_encoded = encoder.transform(label_array) # transform string to digit + return labels_encoded, encoder + + # return pd.arrays.SparseArray( + # [row.toarray().ravel() for row in labels_encoded]), encoder + + +def decode_with_timestamps(events,labels: np.array): + """decode_with_timestamps + Decodes the predicted label array (2d) into a list of + [(Labelname, onset, offset), ...] 
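+    Onsets and offsets are frame indices obtained via find_contiguous_regions; they
+    can be rescaled to seconds afterwards (e.g. with predictions_to_time()).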
+ + :param encoder: Encoder during training + :type encoder: pre.MultiLabelBinarizer + :param labels: n-dim array + :type labels: np.array + """ + # print('events ',events) + # print('labels ',labels.shape) + #assert 1==2 + if labels.ndim == 2: + #print('...') + return [_decode_with_timestamps(events[i],labels[i]) for i in range(labels.shape[0])] + else: + return _decode_with_timestamps(events,labels) + + +def median_filter(x, window_size, threshold=0.5): + """median_filter + :param x: input prediction array of shape (B, T, C) or (B, T). + Input is a sequence of probabilities 0 <= x <= 1 + :param window_size: An integer to use + :param threshold: Binary thresholding threshold + """ + x = binarize(x, threshold=threshold) # transfer to 0 or 1 + if x.ndim == 3: + size = (1, window_size, 1) + elif x.ndim == 2 and x.shape[0] == 1: + # Assume input is class-specific median filtering + # E.g, Batch x Time [1, 501] + size = (1, window_size) + elif x.ndim == 2 and x.shape[0] > 1: + # Assume input is standard median pooling, class-independent + # E.g., Time x Class [501, 10] + size = (window_size, 1) + return scipy.ndimage.median_filter(x, size=size) + + +def _decode_with_timestamps(events,labels): + result_labels = [] + # print('.......') + # print('labels ',labels.shape) + # print(labels) + change_indices = find_contiguous_regions(labels) + # print(change_indices) + # assert 1==2 + for row in change_indices: + result_labels.append((events,row[0], row[1])) + return result_labels + +def inverse_transform_labels(encoder, pred): + if pred.ndim == 3: + return [encoder.inverse_transform(x) for x in pred] + else: + return encoder.inverse_transform(pred) + + +def binarize(pred, threshold=0.5): + # Batch_wise + if pred.ndim == 3: + return np.array( + [pre.binarize(sub, threshold=threshold) for sub in pred]) + else: + return pre.binarize(pred, threshold=threshold) + + +def double_threshold(x, high_thres, low_thres, n_connect=1): + """double_threshold + Helper function to calculate double threshold for n-dim arrays + + :param x: input array + :param high_thres: high threshold value + :param low_thres: Low threshold value + :param n_connect: Distance of <= n clusters will be merged + """ + assert x.ndim <= 3, "Whoops something went wrong with the input ({}), check if its <= 3 dims".format( + x.shape) + if x.ndim == 3: + apply_dim = 1 + elif x.ndim < 3: + apply_dim = 0 + # x is assumed to be 3d: (batch, time, dim) + # Assumed to be 2d : (time, dim) + # Assumed to be 1d : (time) + # time axis is therefore at 1 for 3d and 0 for 2d ( + return np.apply_along_axis(lambda x: _double_threshold( + x, high_thres, low_thres, n_connect=n_connect), + axis=apply_dim, + arr=x) + + +def _double_threshold(x, high_thres, low_thres, n_connect=1, return_arr=True): # in nature, double_threshold considers boundary question + """_double_threshold + Computes a double threshold over the input array + + :param x: input array, needs to be 1d + :param high_thres: High threshold over the array + :param low_thres: Low threshold over the array + :param n_connect: Postprocessing, maximal distance between clusters to connect + :param return_arr: By default this function returns the filtered indiced, but if return_arr = True it returns an array of tsame size as x filled with ones and zeros. 
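+    In short: frames above low_thres form candidate regions, a region is kept only if
+    it also contains at least one frame above high_thres, and kept regions at most
+    n_connect frames apart are merged by connect_().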
+ """ + assert x.ndim == 1, "Input needs to be 1d" + high_locations = np.where(x > high_thres)[0] # return the index, where value is greater than high_thres + locations = x > low_thres # return true of false + encoded_pairs = find_contiguous_regions(locations) + # print('encoded_pairs ',encoded_pairs) + filtered_list = list( + filter( + lambda pair: + ((pair[0] <= high_locations) & (high_locations <= pair[1])).any(), + encoded_pairs)) # find encoded_pair where inclide a high_lacations + #print('filtered_list ',filtered_list) + filtered_list = connect_(filtered_list, n_connect) # if the distance of two pair is less than n_connect, we can merge them + if return_arr: + zero_one_arr = np.zeros_like(x, dtype=int) + for sl in filtered_list: + zero_one_arr[sl[0]:sl[1]] = 1 + return zero_one_arr + return filtered_list + + +def connect_clusters(x, n=1): + if x.ndim == 1: + return connect_clusters_(x, n) + if x.ndim >= 2: + return np.apply_along_axis(lambda a: connect_clusters_(a, n=n), -2, x) + + +def connect_clusters_(x, n=1): + """connect_clusters_ + Connects clustered predictions (0,1) in x with range n + + :param x: Input array. zero-one format + :param n: Number of frames to skip until connection can be made + """ + assert x.ndim == 1, "input needs to be 1d" + reg = find_contiguous_regions(x) + start_end = connect_(reg, n=n) + zero_one_arr = np.zeros_like(x, dtype=int) + for sl in start_end: + zero_one_arr[sl[0]:sl[1]] = 1 + return zero_one_arr + + +def connect_(pairs, n=1): + """connect_ + Connects two adjacent clusters if their distance is <= n + + :param pairs: Clusters of iterateables e.g., [(1,5),(7,10)] + :param n: distance between two clusters + """ + if len(pairs) == 0: + return [] + start_, end_ = pairs[0] + new_pairs = [] + for i, (next_item, cur_item) in enumerate(zip(pairs[1:], pairs[0:])): + end_ = next_item[1] + if next_item[0] - cur_item[1] <= n: + pass + else: + new_pairs.append((start_, cur_item[1])) + start_ = next_item[0] + new_pairs.append((start_, end_)) + return new_pairs + + +def predictions_to_time(df, ratio): + df.onset = df.onset * ratio + df.offset = df.offset * ratio + return df + +def upgrade_resolution(arr, scale): + print('arr ',arr.shape) + x = np.arange(0, arr.shape[0]) + f = interp1d(x, arr, kind='linear', axis=0, fill_value='extrapolate') + scale_x = np.arange(0, arr.shape[0], 1 / scale) + up_scale = f(scale_x) + return up_scale +# a = [0.1,0.2,0.3,0.8,0.4,0.1,0.3,0.9,0.4] +# a = np.array(a) +# b = a>0.2 +# _double_threshold(a,0.7,0.2) \ No newline at end of file diff --git a/download.sh b/download.sh index e6be9cc..4bea05f 100644 --- a/download.sh +++ b/download.sh @@ -31,4 +31,17 @@ wget -P text_to_speech/checkpoints/ljspeech/ps_adv_baseline -i https://huggingfa # Audio to text wget -P audio_to_text/audiocaps_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth wget -P audio_to_text/clotho_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth -wget -P audio_to_text/pretrained_feature_extractors https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth \ No newline at end of file +wget -P 
audio_to_text/pretrained_feature_extractors https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth +# audio detection +cd audio_detection/audio_infer/useful_ckpts +wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/audio_detection.pth +cd mono2binaural/useful_ckpts +wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/m2b.tar.gz +tar -zxvf m2b.tar.gz ./ +rm m2b.tar.gz +cd audio_detection/target_sound_detection/useful_ckpts +wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/tsd.tar.gz +tar -zxvf tsd.tar.gz ./ +rm tsd.tar.gz +cd sound_extraction/useful_ckpts +wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/LASSNet.pt \ No newline at end of file diff --git a/mono2binaural/src/models.py b/mono2binaural/src/models.py new file mode 100644 index 0000000..0d40527 --- /dev/null +++ b/mono2binaural/src/models.py @@ -0,0 +1,110 @@ +import numpy as np +import scipy.linalg +from scipy.spatial.transform import Rotation as R +import torch as th +import torch.nn as nn +import torch.nn.functional as F +from src.warping import GeometricTimeWarper, MonotoneTimeWarper +from src.utils import Net + + +class GeometricWarper(nn.Module): + def __init__(self, sampling_rate=48000): + super().__init__() + self.warper = GeometricTimeWarper(sampling_rate=sampling_rate) + + def _transmitter_mouth(self, view): + # offset between tracking markers and real mouth position in the dataset + mouth_offset = np.array([0.09, 0, -0.20]) + quat = view[:, 3:, :].transpose(2, 1).contiguous().detach().cpu().view(-1, 4).numpy() + # make sure zero-padded values are set to non-zero values (else scipy raises an exception) + norms = scipy.linalg.norm(quat, axis=1) + eps_val = (norms == 0).astype(np.float32) + quat = quat + eps_val[:, None] + transmitter_rot_mat = R.from_quat(quat) + transmitter_mouth = transmitter_rot_mat.apply(mouth_offset, inverse=True) + transmitter_mouth = th.Tensor(transmitter_mouth).view(view.shape[0], -1, 3).transpose(2, 1).contiguous() + if view.is_cuda: + transmitter_mouth = transmitter_mouth.cuda() + return transmitter_mouth + + def _3d_displacements(self, view): + transmitter_mouth = self._transmitter_mouth(view) + # offset between tracking markers and ears in the dataset + left_ear_offset = th.Tensor([0, -0.08, -0.22]).cuda() if view.is_cuda else th.Tensor([0, -0.08, -0.22]) + right_ear_offset = th.Tensor([0, 0.08, -0.22]).cuda() if view.is_cuda else th.Tensor([0, 0.08, -0.22]) + # compute displacements between transmitter mouth and receiver left/right ear + displacement_left = view[:, 0:3, :] + transmitter_mouth - left_ear_offset[None, :, None] + displacement_right = view[:, 0:3, :] + transmitter_mouth - right_ear_offset[None, :, None] + displacement = th.stack([displacement_left, displacement_right], dim=1) + return displacement + + def _warpfield(self, view, seq_length): + return self.warper.displacements2warpfield(self._3d_displacements(view), seq_length) + + def forward(self, mono, view): + ''' + :param mono: input signal as tensor of shape B x 1 x T + :param view: rx/tx position/orientation as tensor of shape B x 7 x K (K = T / 400) + :return: warped: warped left/right ear signal as tensor of shape B x 2 x T + ''' + return self.warper(th.cat([mono, mono], dim=1), self._3d_displacements(view)) + + +class Warpnet(nn.Module): + def __init__(self, layers=4, channels=64, view_dim=7): + super().__init__() + self.layers = [nn.Conv1d(view_dim if l == 0 else 
channels, channels, kernel_size=2) for l in range(layers)] + self.layers = nn.ModuleList(self.layers) + self.linear = nn.Conv1d(channels, 2, kernel_size=1) + self.neural_warper = MonotoneTimeWarper() + self.geometric_warper = GeometricWarper() + + def neural_warpfield(self, view, seq_length): + warpfield = view + for layer in self.layers: + warpfield = F.pad(warpfield, pad=[1, 0]) + warpfield = F.relu(layer(warpfield)) + warpfield = self.linear(warpfield) + warpfield = F.interpolate(warpfield, size=seq_length) + return warpfield + + def forward(self, mono, view): + ''' + :param mono: input signal as tensor of shape B x 1 x T + :param view: rx/tx position/orientation as tensor of shape B x 7 x K (K = T / 400) + :return: warped: warped left/right ear signal as tensor of shape B x 2 x T + ''' + geometric_warpfield = self.geometric_warper._warpfield(view, mono.shape[-1]) + neural_warpfield = self.neural_warpfield(view, mono.shape[-1]) + warpfield = geometric_warpfield + neural_warpfield + # ensure causality + warpfield = -F.relu(-warpfield) # the predicted warp + warped = self.neural_warper(th.cat([mono, mono], dim=1), warpfield) + return warped + +class BinauralNetwork(Net): + def __init__(self, + view_dim=7, + warpnet_layers=4, + warpnet_channels=64, + model_name='binaural_network', + use_cuda=True): + super().__init__(model_name, use_cuda) + self.warper = Warpnet(warpnet_layers, warpnet_channels) + if self.use_cuda: + self.cuda() + + def forward(self, mono, view): + ''' + :param mono: the input signal as a B x 1 x T tensor + :param view: the receiver/transmitter position as a B x 7 x T tensor + :return: out: the binaural output produced by the network + intermediate: a two-channel audio signal obtained from the output of each intermediate layer + as a list of B x 2 x T tensors + ''' + # print('mono ', mono.shape) + # print('view ', view.shape) + warped = self.warper(mono, view) + # print('warped ', warped.shape) + return warped diff --git a/mono2binaural/src/utils.py b/mono2binaural/src/utils.py new file mode 100644 index 0000000..074dd84 --- /dev/null +++ b/mono2binaural/src/utils.py @@ -0,0 +1,251 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. +All rights reserved. + +This source code is licensed under the license found in the +LICENSE file in the root directory of this source tree. 
+""" + +import numpy as np +import torch as th +#import torchaudio as ta + + +class Net(th.nn.Module): + + def __init__(self, model_name="network", use_cuda=True): + super().__init__() + self.use_cuda = use_cuda + self.model_name = model_name + + def save(self, model_dir, suffix=''): + ''' + save the network to model_dir/model_name.suffix.net + :param model_dir: directory to save the model to + :param suffix: suffix to append after model name + ''' + if self.use_cuda: + self.cpu() + + if suffix == "": + fname = f"{model_dir}/{self.model_name}.net" + else: + fname = f"{model_dir}/{self.model_name}.{suffix}.net" + + th.save(self.state_dict(), fname) + if self.use_cuda: + self.cuda() + + def load_from_file(self, model_file): + ''' + load network parameters from model_file + :param model_file: file containing the model parameters + ''' + if self.use_cuda: + self.cpu() + + states = th.load(model_file) + self.load_state_dict(states) + + if self.use_cuda: + self.cuda() + print(f"Loaded: {model_file}") + + def load(self, model_dir, suffix=''): + ''' + load network parameters from model_dir/model_name.suffix.net + :param model_dir: directory to load the model from + :param suffix: suffix to append after model name + ''' + if suffix == "": + fname = f"{model_dir}/{self.model_name}.net" + else: + fname = f"{model_dir}/{self.model_name}.{suffix}.net" + self.load_from_file(fname) + + def num_trainable_parameters(self): + ''' + :return: the number of trainable parameters in the model + ''' + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +# class NewbobAdam(th.optim.Adam): + +# def __init__(self, +# weights, +# net, +# artifacts_dir, +# initial_learning_rate=0.001, +# decay=0.5, +# max_decay=0.01 +# ): +# ''' +# Newbob learning rate scheduler +# :param weights: weights to optimize +# :param net: the network, must be an instance of type src.utils.Net +# :param artifacts_dir: (str) directory to save/restore models to/from +# :param initial_learning_rate: (float) initial learning rate +# :param decay: (float) value to decrease learning rate by when loss doesn't improve further +# :param max_decay: (float) maximum decay of learning rate +# ''' +# super().__init__(weights, lr=initial_learning_rate) +# self.last_epoch_loss = np.inf +# self.total_decay = 1 +# self.net = net +# self.decay = decay +# self.max_decay = max_decay +# self.artifacts_dir = artifacts_dir +# # store initial state as backup +# if decay < 1.0: +# net.save(artifacts_dir, suffix="newbob") + +# def update_lr(self, loss): +# ''' +# update the learning rate based on the current loss value and historic loss values +# :param loss: the loss after the current iteration +# ''' +# if loss > self.last_epoch_loss and self.decay < 1.0 and self.total_decay > self.max_decay: +# self.total_decay = self.total_decay * self.decay +# print(f"NewbobAdam: Decay learning rate (loss degraded from {self.last_epoch_loss} to {loss})." 
+# f"Total decay: {self.total_decay}") +# # restore previous network state +# self.net.load(self.artifacts_dir, suffix="newbob") +# # decrease learning rate +# for param_group in self.param_groups: +# param_group['lr'] = param_group['lr'] * self.decay +# else: +# self.last_epoch_loss = loss +# # save last snapshot to restore it in case of lr decrease +# if self.decay < 1.0 and self.total_decay > self.max_decay: +# self.net.save(self.artifacts_dir, suffix="newbob") + + +# class FourierTransform: +# def __init__(self, +# fft_bins=2048, +# win_length_ms=40, +# frame_rate_hz=100, +# causal=False, +# preemphasis=0.0, +# sample_rate=48000, +# normalized=False): +# self.sample_rate = sample_rate +# self.frame_rate_hz = frame_rate_hz +# self.preemphasis = preemphasis +# self.fft_bins = fft_bins +# self.win_length = int(sample_rate * win_length_ms / 1000) +# self.hop_length = int(sample_rate / frame_rate_hz) +# self.causal = causal +# self.normalized = normalized +# if self.win_length > self.fft_bins: +# print('FourierTransform Warning: fft_bins should be larger than win_length') + +# def _convert_format(self, data, expected_dims): +# if not type(data) == th.Tensor: +# data = th.Tensor(data) +# if len(data.shape) < expected_dims: +# data = data.unsqueeze(0) +# if not len(data.shape) == expected_dims: +# raise Exception(f"FourierTransform: data needs to be a Tensor with {expected_dims} dimensions but got shape {data.shape}") +# return data + +# def _preemphasis(self, audio): +# if self.preemphasis > 0: +# return th.cat((audio[:, 0:1], audio[:, 1:] - self.preemphasis * audio[:, :-1]), dim=1) +# return audio + +# def _revert_preemphasis(self, audio): +# if self.preemphasis > 0: +# for i in range(1, audio.shape[1]): +# audio[:, i] = audio[:, i] + self.preemphasis * audio[:, i-1] +# return audio + +# def _magphase(self, complex_stft): +# mag, phase = ta.functional.magphase(complex_stft, 1.0) +# return mag, phase + +# def stft(self, audio): +# ''' +# wrapper around th.stft +# audio: wave signal as th.Tensor +# ''' +# hann = th.hann_window(self.win_length) +# hann = hann.cuda() if audio.is_cuda else hann +# spec = th.stft(audio, n_fft=self.fft_bins, hop_length=self.hop_length, win_length=self.win_length, +# window=hann, center=not self.causal, normalized=self.normalized) +# return spec.contiguous() + +# def complex_spectrogram(self, audio): +# ''' +# audio: wave signal as th.Tensor +# return: th.Tensor of size channels x frequencies x time_steps (channels x y_axis x x_axis) +# ''' +# self._convert_format(audio, expected_dims=2) +# audio = self._preemphasis(audio) +# return self.stft(audio) + +# def magnitude_phase(self, audio): +# ''' +# audio: wave signal as th.Tensor +# return: tuple containing two th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum +# ''' +# stft = self.complex_spectrogram(audio) +# return self._magphase(stft) + +# def mag_spectrogram(self, audio): +# ''' +# audio: wave signal as th.Tensor +# return: magnitude spectrum as th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum +# ''' +# return self.magnitude_phase(audio)[0] + +# def power_spectrogram(self, audio): +# ''' +# audio: wave signal as th.Tensor +# return: power spectrum as th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum +# ''' +# return th.pow(self.mag_spectrogram(audio), 2.0) + +# def phase_spectrogram(self, audio): +# ''' +# audio: wave signal as th.Tensor +# return: phase spectrum as th.Tensor of size channels x 
frequencies x time_steps for magnitude and phase spectrum +# ''' +# return self.magnitude_phase(audio)[1] + +# def mel_spectrogram(self, audio, n_mels): +# ''' +# audio: wave signal as th.Tensor +# n_mels: number of bins used for mel scale warping +# return: mel spectrogram as th.Tensor of size channels x n_mels x time_steps for magnitude and phase spectrum +# ''' +# spec = self.power_spectrogram(audio) +# mel_warping = ta.transforms.MelScale(n_mels, self.sample_rate) +# return mel_warping(spec) + +# def complex_spec2wav(self, complex_spec, length): +# ''' +# inverse stft +# complex_spec: complex spectrum as th.Tensor of size channels x frequencies x time_steps x 2 (real part/imaginary part) +# length: length of the audio to be reconstructed (in frames) +# ''' +# complex_spec = self._convert_format(complex_spec, expected_dims=4) +# hann = th.hann_window(self.win_length) +# hann = hann.cuda() if complex_spec.is_cuda else hann +# wav = ta.functional.istft(complex_spec, n_fft=self.fft_bins, hop_length=self.hop_length, win_length=self.win_length, window=hann, length=length, center=not self.causal) +# wav = self._revert_preemphasis(wav) +# return wav + +# def magphase2wav(self, mag_spec, phase_spec, length): +# ''' +# reconstruction of wav signal from magnitude and phase spectrum +# mag_spec: magnitude spectrum as th.Tensor of size channels x frequencies x time_steps +# phase_spec: phase spectrum as th.Tensor of size channels x frequencies x time_steps +# length: length of the audio to be reconstructed (in frames) +# ''' +# mag_spec = self._convert_format(mag_spec, expected_dims=3) +# phase_spec = self._convert_format(phase_spec, expected_dims=3) +# complex_spec = th.stack([mag_spec * th.cos(phase_spec), mag_spec * th.sin(phase_spec)], dim=-1) +# return self.complex_spec2wav(complex_spec, length) + diff --git a/mono2binaural/src/warping.py b/mono2binaural/src/warping.py new file mode 100644 index 0000000..9d7c4ed --- /dev/null +++ b/mono2binaural/src/warping.py @@ -0,0 +1,113 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. +All rights reserved. + +This source code is licensed under the license found in the +LICENSE file in the root directory of this source tree. 
+""" + +import torch as th +import torch.nn as nn +import torch.nn.functional as F + + +class TimeWarperFunction(th.autograd.Function): + + @staticmethod + def forward(ctx, input, warpfield): + ''' + :param ctx: autograd context + :param input: input signal (B x 2 x T) + :param warpfield: the corresponding warpfield (B x 2 x T) + :return: the warped signal (B x 2 x T) + ''' + ctx.save_for_backward(input, warpfield) + # compute index list to lookup warped input values + idx_left = warpfield.floor().type(th.long) + idx_right = th.clamp(warpfield.ceil().type(th.long), max=input.shape[-1]-1) + # compute weight for linear interpolation + alpha = warpfield - warpfield.floor() + # linear interpolation + output = (1 - alpha) * th.gather(input, 2, idx_left) + alpha * th.gather(input, 2, idx_right) + return output + + @staticmethod + def backward(ctx, grad_output): + input, warpfield = ctx.saved_tensors + # compute index list to lookup warped input values + idx_left = warpfield.floor().type(th.long) + idx_right = th.clamp(warpfield.ceil().type(th.long), max=input.shape[-1]-1) + # warpfield gradient + grad_warpfield = th.gather(input, 2, idx_right) - th.gather(input, 2, idx_left) + grad_warpfield = grad_output * grad_warpfield + # input gradient + grad_input = th.zeros(input.shape, device=input.device) + alpha = warpfield - warpfield.floor() + grad_input = grad_input.scatter_add(2, idx_left, grad_output * (1 - alpha)) + \ + grad_input.scatter_add(2, idx_right, grad_output * alpha) + return grad_input, grad_warpfield + + +class TimeWarper(nn.Module): + + def __init__(self): + super().__init__() + self.warper = TimeWarperFunction().apply + + def _to_absolute_positions(self, warpfield, seq_length): + # translate warpfield from relative warp indices to absolute indices ([1...T] + warpfield) + temp_range = th.arange(seq_length, dtype=th.float) + temp_range = temp_range.cuda() if warpfield.is_cuda else temp_range + return th.clamp(warpfield + temp_range[None, None, :], min=0, max=seq_length-1) + + def forward(self, input, warpfield): + ''' + :param input: audio signal to be warped (B x 2 x T) + :param warpfield: the corresponding warpfield (B x 2 x T) + :return: the warped signal (B x 2 x T) + ''' + warpfield = self._to_absolute_positions(warpfield, input.shape[-1]) + warped = self.warper(input, warpfield) + return warped + + +class MonotoneTimeWarper(TimeWarper): + + def forward(self, input, warpfield): + ''' + :param input: audio signal to be warped (B x 2 x T) + :param warpfield: the corresponding warpfield (B x 2 x T) + :return: the warped signal (B x 2 x T), ensured to be monotonous + ''' + warpfield = self._to_absolute_positions(warpfield, input.shape[-1]) + # ensure monotonicity: each warp must be at least as big as previous_warp-1 + warpfield = th.cummax(warpfield, dim=-1)[0] + # print('warpfield ',warpfield.shape) + # warp + warped = self.warper(input, warpfield) + return warped + + +class GeometricTimeWarper(TimeWarper): + + def __init__(self, sampling_rate=48000): + super().__init__() + self.sampling_rate = sampling_rate + + def displacements2warpfield(self, displacements, seq_length): + distance = th.sum(displacements**2, dim=2) ** 0.5 + distance = F.interpolate(distance, size=seq_length) + warpfield = -distance / 343.0 * self.sampling_rate + return warpfield + + def forward(self, input, displacements): + ''' + :param input: audio signal to be warped (B x 2 x T) + :param displacements: sequence of 3D displacement vectors for geometric warping (B x 3 x T) + :return: the warped signal (B x 2 x T) 
+ ''' + warpfield = self.displacements2warpfield(displacements, input.shape[-1]) + # print('Ge warpfield ', warpfield.shape) + # assert 1==2 + warped = super().forward(input, warpfield) + return warped diff --git a/sound_extraction/model/LASSNet.py b/sound_extraction/model/LASSNet.py new file mode 100644 index 0000000..a525e3c --- /dev/null +++ b/sound_extraction/model/LASSNet.py @@ -0,0 +1,25 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from .text_encoder import Text_Encoder +from .resunet_film import UNetRes_FiLM + +class LASSNet(nn.Module): + def __init__(self, device='cuda'): + super(LASSNet, self).__init__() + self.text_embedder = Text_Encoder(device) + self.UNet = UNetRes_FiLM(channels=1, cond_embedding_dim=256) + + def forward(self, x, caption): + # x: (Batch, 1, T, 128)) + input_ids, attns_mask = self.text_embedder.tokenize(caption) + + cond_vec = self.text_embedder(input_ids, attns_mask)[0] + dec_cond_vec = cond_vec + + mask = self.UNet(x, cond_vec, dec_cond_vec) + mask = torch.sigmoid(mask) + return mask + + def get_tokenizer(self): + return self.text_embedder.tokenizer diff --git a/sound_extraction/model/film.py b/sound_extraction/model/film.py new file mode 100644 index 0000000..e4e9a9d --- /dev/null +++ b/sound_extraction/model/film.py @@ -0,0 +1,27 @@ +import torch +import torch.nn as nn + +class Film(nn.Module): + def __init__(self, channels, cond_embedding_dim): + super(Film, self).__init__() + self.linear = nn.Sequential( + nn.Linear(cond_embedding_dim, channels * 2), + nn.ReLU(inplace=True), + nn.Linear(channels * 2, channels), + nn.ReLU(inplace=True) + ) + + def forward(self, data, cond_vec): + """ + :param data: [batchsize, channels, samples] or [batchsize, channels, T, F] or [batchsize, channels, F, T] + :param cond_vec: [batchsize, cond_embedding_dim] + :return: + """ + bias = self.linear(cond_vec) # [batchsize, channels] + if len(list(data.size())) == 3: + data = data + bias[..., None] + elif len(list(data.size())) == 4: + data = data + bias[..., None, None] + else: + print("Warning: The size of input tensor,", data.size(), "is not correct. 
Film is not working.") + return data \ No newline at end of file diff --git a/sound_extraction/model/modules.py b/sound_extraction/model/modules.py new file mode 100644 index 0000000..1124b1a --- /dev/null +++ b/sound_extraction/model/modules.py @@ -0,0 +1,483 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from .film import Film + +class ConvBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, activation, momentum): + super(ConvBlock, self).__init__() + + self.activation = activation + padding = (kernel_size[0] // 2, kernel_size[1] // 2) + + self.conv1 = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=(1, 1), + dilation=(1, 1), + padding=padding, + bias=False, + ) + + self.bn1 = nn.BatchNorm2d(out_channels, momentum=momentum) + + self.conv2 = nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=(1, 1), + dilation=(1, 1), + padding=padding, + bias=False, + ) + + self.bn2 = nn.BatchNorm2d(out_channels, momentum=momentum) + + self.init_weights() + + def init_weights(self): + init_layer(self.conv1) + init_layer(self.conv2) + init_bn(self.bn1) + init_bn(self.bn2) + + def forward(self, x): + x = act(self.bn1(self.conv1(x)), self.activation) + x = act(self.bn2(self.conv2(x)), self.activation) + return x + + +class EncoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, downsample, activation, momentum): + super(EncoderBlock, self).__init__() + + self.conv_block = ConvBlock( + in_channels, out_channels, kernel_size, activation, momentum + ) + self.downsample = downsample + + def forward(self, x): + encoder = self.conv_block(x) + encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample) + return encoder_pool, encoder + + +class DecoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, upsample, activation, momentum): + super(DecoderBlock, self).__init__() + self.kernel_size = kernel_size + self.stride = upsample + self.activation = activation + + self.conv1 = torch.nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=self.stride, + stride=self.stride, + padding=(0, 0), + bias=False, + dilation=(1, 1), + ) + + self.bn1 = nn.BatchNorm2d(out_channels, momentum=momentum) + + self.conv_block2 = ConvBlock( + out_channels * 2, out_channels, kernel_size, activation, momentum + ) + + def init_weights(self): + init_layer(self.conv1) + init_bn(self.bn) + + def prune(self, x): + """Prune the shape of x after transpose convolution.""" + padding = (self.kernel_size[0] // 2, self.kernel_size[1] // 2) + x = x[ + :, + :, + padding[0] : padding[0] - self.stride[0], + padding[1] : padding[1] - self.stride[1]] + return x + + def forward(self, input_tensor, concat_tensor): + x = act(self.bn1(self.conv1(input_tensor)), self.activation) + # from IPython import embed; embed(using=False); os._exit(0) + # x = self.prune(x) + x = torch.cat((x, concat_tensor), dim=1) + x = self.conv_block2(x) + return x + + +class EncoderBlockRes1B(nn.Module): + def __init__(self, in_channels, out_channels, downsample, activation, momentum): + super(EncoderBlockRes1B, self).__init__() + size = (3,3) + + self.conv_block1 = ConvBlockRes(in_channels, out_channels, size, activation, momentum) + self.conv_block2 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + self.conv_block3 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + 
self.conv_block4 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + self.downsample = downsample + + def forward(self, x): + encoder = self.conv_block1(x) + encoder = self.conv_block2(encoder) + encoder = self.conv_block3(encoder) + encoder = self.conv_block4(encoder) + encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample) + return encoder_pool, encoder + +class DecoderBlockRes1B(nn.Module): + def __init__(self, in_channels, out_channels, stride, activation, momentum): + super(DecoderBlockRes1B, self).__init__() + size = (3,3) + self.activation = activation + + self.conv1 = torch.nn.ConvTranspose2d(in_channels=in_channels, + out_channels=out_channels, kernel_size=size, stride=stride, + padding=(0, 0), output_padding=(0, 0), bias=False, dilation=1) + + self.bn1 = nn.BatchNorm2d(in_channels) + self.conv_block2 = ConvBlockRes(out_channels * 2, out_channels, size, activation, momentum) + self.conv_block3 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + self.conv_block4 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + self.conv_block5 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + + def init_weights(self): + init_layer(self.conv1) + + def prune(self, x, both=False): + """Prune the shape of x after transpose convolution. + """ + if(both): x = x[:, :, 0 : - 1, 0:-1] + else: x = x[:, :, 0: - 1, :] + return x + + def forward(self, input_tensor, concat_tensor,both=False): + x = self.conv1(F.relu_(self.bn1(input_tensor))) + x = self.prune(x,both=both) + x = torch.cat((x, concat_tensor), dim=1) + x = self.conv_block2(x) + x = self.conv_block3(x) + x = self.conv_block4(x) + x = self.conv_block5(x) + return x + + +class EncoderBlockRes2BCond(nn.Module): + def __init__(self, in_channels, out_channels, downsample, activation, momentum, cond_embedding_dim): + super(EncoderBlockRes2BCond, self).__init__() + size = (3, 3) + + self.conv_block1 = ConvBlockResCond(in_channels, out_channels, size, activation, momentum, cond_embedding_dim) + self.conv_block2 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim) + self.downsample = downsample + + def forward(self, x, cond_vec): + encoder = self.conv_block1(x, cond_vec) + encoder = self.conv_block2(encoder, cond_vec) + encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample) + return encoder_pool, encoder + +class DecoderBlockRes2BCond(nn.Module): + def __init__(self, in_channels, out_channels, stride, activation, momentum, cond_embedding_dim): + super(DecoderBlockRes2BCond, self).__init__() + size = (3, 3) + self.activation = activation + + self.conv1 = torch.nn.ConvTranspose2d(in_channels=in_channels, + out_channels=out_channels, kernel_size=size, stride=stride, + padding=(0, 0), output_padding=(0, 0), bias=False, dilation=1) + + self.bn1 = nn.BatchNorm2d(in_channels) + self.conv_block2 = ConvBlockResCond(out_channels * 2, out_channels, size, activation, momentum, cond_embedding_dim) + self.conv_block3 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim) + + def init_weights(self): + init_layer(self.conv1) + + def prune(self, x, both=False): + """Prune the shape of x after transpose convolution. 
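+
+        The kernel-3, stride-2 transposed convolution yields a slightly larger map
+        than the matching encoder feature, so the last row (and the last column when
+        both=True) is cropped before concatenation with the skip connection.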
+ """ + if(both): x = x[:, :, 0 : - 1, 0:-1] + else: x = x[:, :, 0: - 1, :] + return x + + def forward(self, input_tensor, concat_tensor, cond_vec, both=False): + x = self.conv1(F.relu_(self.bn1(input_tensor))) + x = self.prune(x, both=both) + x = torch.cat((x, concat_tensor), dim=1) + x = self.conv_block2(x, cond_vec) + x = self.conv_block3(x, cond_vec) + return x + +class EncoderBlockRes4BCond(nn.Module): + def __init__(self, in_channels, out_channels, downsample, activation, momentum, cond_embedding_dim): + super(EncoderBlockRes4B, self).__init__() + size = (3,3) + + self.conv_block1 = ConvBlockResCond(in_channels, out_channels, size, activation, momentum, cond_embedding_dim) + self.conv_block2 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim) + self.conv_block3 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim) + self.conv_block4 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim) + self.downsample = downsample + + def forward(self, x, cond_vec): + encoder = self.conv_block1(x, cond_vec) + encoder = self.conv_block2(encoder, cond_vec) + encoder = self.conv_block3(encoder, cond_vec) + encoder = self.conv_block4(encoder, cond_vec) + encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample) + return encoder_pool, encoder + +class DecoderBlockRes4BCond(nn.Module): + def __init__(self, in_channels, out_channels, stride, activation, momentum, cond_embedding_dim): + super(DecoderBlockRes4B, self).__init__() + size = (3, 3) + self.activation = activation + + self.conv1 = torch.nn.ConvTranspose2d(in_channels=in_channels, + out_channels=out_channels, kernel_size=size, stride=stride, + padding=(0, 0), output_padding=(0, 0), bias=False, dilation=1) + + self.bn1 = nn.BatchNorm2d(in_channels) + self.conv_block2 = ConvBlockResCond(out_channels * 2, out_channels, size, activation, momentum, cond_embedding_dim) + self.conv_block3 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim) + self.conv_block4 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim) + self.conv_block5 = ConvBlockResCond(out_channels, out_channels, size, activation, momentum, cond_embedding_dim) + + def init_weights(self): + init_layer(self.conv1) + + def prune(self, x, both=False): + """Prune the shape of x after transpose convolution. 
+ """ + if(both): x = x[:, :, 0 : - 1, 0:-1] + else: x = x[:, :, 0: - 1, :] + return x + + def forward(self, input_tensor, concat_tensor, cond_vec, both=False): + x = self.conv1(F.relu_(self.bn1(input_tensor))) + x = self.prune(x,both=both) + x = torch.cat((x, concat_tensor), dim=1) + x = self.conv_block2(x, cond_vec) + x = self.conv_block3(x, cond_vec) + x = self.conv_block4(x, cond_vec) + x = self.conv_block5(x, cond_vec) + return x + +class EncoderBlockRes4B(nn.Module): + def __init__(self, in_channels, out_channels, downsample, activation, momentum): + super(EncoderBlockRes4B, self).__init__() + size = (3, 3) + + self.conv_block1 = ConvBlockRes(in_channels, out_channels, size, activation, momentum) + self.conv_block2 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + self.conv_block3 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + self.conv_block4 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + self.downsample = downsample + + def forward(self, x): + encoder = self.conv_block1(x) + encoder = self.conv_block2(encoder) + encoder = self.conv_block3(encoder) + encoder = self.conv_block4(encoder) + encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample) + return encoder_pool, encoder + +class DecoderBlockRes4B(nn.Module): + def __init__(self, in_channels, out_channels, stride, activation, momentum): + super(DecoderBlockRes4B, self).__init__() + size = (3,3) + self.activation = activation + + self.conv1 = torch.nn.ConvTranspose2d(in_channels=in_channels, + out_channels=out_channels, kernel_size=size, stride=stride, + padding=(0, 0), output_padding=(0, 0), bias=False, dilation=1) + + self.bn1 = nn.BatchNorm2d(in_channels) + self.conv_block2 = ConvBlockRes(out_channels * 2, out_channels, size, activation, momentum) + self.conv_block3 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + self.conv_block4 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + self.conv_block5 = ConvBlockRes(out_channels, out_channels, size, activation, momentum) + + def init_weights(self): + init_layer(self.conv1) + + def prune(self, x, both=False): + """Prune the shape of x after transpose convolution. + """ + if(both): x = x[:, :, 0 : - 1, 0:-1] + else: x = x[:, :, 0: - 1, :] + return x + + def forward(self, input_tensor, concat_tensor,both=False): + x = self.conv1(F.relu_(self.bn1(input_tensor))) + x = self.prune(x,both=both) + x = torch.cat((x, concat_tensor), dim=1) + x = self.conv_block2(x) + x = self.conv_block3(x) + x = self.conv_block4(x) + x = self.conv_block5(x) + return x + +class ConvBlockResCond(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, activation, momentum, cond_embedding_dim): + r"""Residual block. 
+ """ + super(ConvBlockResCond, self).__init__() + + self.activation = activation + padding = [kernel_size[0] // 2, kernel_size[1] // 2] + + self.bn1 = nn.BatchNorm2d(in_channels) + self.bn2 = nn.BatchNorm2d(out_channels) + + self.conv1 = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, stride=(1, 1), + dilation=(1, 1), padding=padding, bias=False) + self.film1 = Film(channels=out_channels, cond_embedding_dim=cond_embedding_dim) + self.conv2 = nn.Conv2d(in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, stride=(1, 1), + dilation=(1, 1), padding=padding, bias=False) + self.film2 = Film(channels=out_channels, cond_embedding_dim=cond_embedding_dim) + + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) + self.film_res = Film(channels=out_channels, cond_embedding_dim=cond_embedding_dim) + self.is_shortcut = True + else: + self.is_shortcut = False + + self.init_weights() + + def init_weights(self): + init_bn(self.bn1) + init_bn(self.bn2) + init_layer(self.conv1) + init_layer(self.conv2) + + if self.is_shortcut: + init_layer(self.shortcut) + + def forward(self, x, cond_vec): + origin = x + x = self.conv1(F.leaky_relu_(self.bn1(x), negative_slope=0.01)) + x = self.film1(x, cond_vec) + x = self.conv2(F.leaky_relu_(self.bn2(x), negative_slope=0.01)) + x = self.film2(x, cond_vec) + if self.is_shortcut: + residual = self.shortcut(origin) + residual = self.film_res(residual, cond_vec) + return residual + x + else: + return origin + x + +class ConvBlockRes(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, activation, momentum): + r"""Residual block. + """ + super(ConvBlockRes, self).__init__() + + self.activation = activation + padding = [kernel_size[0] // 2, kernel_size[1] // 2] + + self.bn1 = nn.BatchNorm2d(in_channels) + self.bn2 = nn.BatchNorm2d(out_channels) + + self.conv1 = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, stride=(1, 1), + dilation=(1, 1), padding=padding, bias=False) + + self.conv2 = nn.Conv2d(in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, stride=(1, 1), + dilation=(1, 1), padding=padding, bias=False) + + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels=in_channels, + out_channels=out_channels, kernel_size=(1, 1), stride=(1, 1), padding=(0, 0)) + self.is_shortcut = True + else: + self.is_shortcut = False + + self.init_weights() + + def init_weights(self): + init_bn(self.bn1) + init_bn(self.bn2) + init_layer(self.conv1) + init_layer(self.conv2) + + if self.is_shortcut: + init_layer(self.shortcut) + + def forward(self, x): + origin = x + x = self.conv1(F.leaky_relu_(self.bn1(x), negative_slope=0.01)) + x = self.conv2(F.leaky_relu_(self.bn2(x), negative_slope=0.01)) + + if self.is_shortcut: + return self.shortcut(origin) + x + else: + return origin + x + +def init_layer(layer): + """Initialize a Linear or Convolutional layer. """ + nn.init.xavier_uniform_(layer.weight) + + if hasattr(layer, 'bias'): + if layer.bias is not None: + layer.bias.data.fill_(0.) + +def init_bn(bn): + """Initialize a Batchnorm layer. """ + bn.bias.data.fill_(0.) + bn.weight.data.fill_(1.) + +def init_gru(rnn): + """Initialize a GRU layer. 
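+    The stacked gate weights are initialised block-wise: input-to-hidden blocks use
+    a uniform fan-in-scaled init, the last hidden-to-hidden block is orthogonal, and
+    all biases are zeroed.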
""" + + def _concat_init(tensor, init_funcs): + (length, fan_out) = tensor.shape + fan_in = length // len(init_funcs) + + for (i, init_func) in enumerate(init_funcs): + init_func(tensor[i * fan_in: (i + 1) * fan_in, :]) + + def _inner_uniform(tensor): + fan_in = nn.init._calculate_correct_fan(tensor, 'fan_in') + nn.init.uniform_(tensor, -math.sqrt(3 / fan_in), math.sqrt(3 / fan_in)) + + for i in range(rnn.num_layers): + _concat_init( + getattr(rnn, 'weight_ih_l{}'.format(i)), + [_inner_uniform, _inner_uniform, _inner_uniform] + ) + torch.nn.init.constant_(getattr(rnn, 'bias_ih_l{}'.format(i)), 0) + + _concat_init( + getattr(rnn, 'weight_hh_l{}'.format(i)), + [_inner_uniform, _inner_uniform, nn.init.orthogonal_] + ) + torch.nn.init.constant_(getattr(rnn, 'bias_hh_l{}'.format(i)), 0) + + +def act(x, activation): + if activation == 'relu': + return F.relu_(x) + + elif activation == 'leaky_relu': + return F.leaky_relu_(x, negative_slope=0.2) + + elif activation == 'swish': + return x * torch.sigmoid(x) + + else: + raise Exception('Incorrect activation!') \ No newline at end of file diff --git a/sound_extraction/model/resunet_film.py b/sound_extraction/model/resunet_film.py new file mode 100644 index 0000000..c00addc --- /dev/null +++ b/sound_extraction/model/resunet_film.py @@ -0,0 +1,110 @@ +from .modules import * +import numpy as np + +class UNetRes_FiLM(nn.Module): + def __init__(self, channels, cond_embedding_dim, nsrc=1): + super(UNetRes_FiLM, self).__init__() + activation = 'relu' + momentum = 0.01 + + self.nsrc = nsrc + self.channels = channels + self.downsample_ratio = 2 ** 6 # This number equals 2^{#encoder_blocks} + + self.encoder_block1 = EncoderBlockRes2BCond(in_channels=channels * nsrc, out_channels=32, + downsample=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.encoder_block2 = EncoderBlockRes2BCond(in_channels=32, out_channels=64, + downsample=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.encoder_block3 = EncoderBlockRes2BCond(in_channels=64, out_channels=128, + downsample=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.encoder_block4 = EncoderBlockRes2BCond(in_channels=128, out_channels=256, + downsample=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.encoder_block5 = EncoderBlockRes2BCond(in_channels=256, out_channels=384, + downsample=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.encoder_block6 = EncoderBlockRes2BCond(in_channels=384, out_channels=384, + downsample=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.conv_block7 = ConvBlockResCond(in_channels=384, out_channels=384, + kernel_size=(3, 3), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.decoder_block1 = DecoderBlockRes2BCond(in_channels=384, out_channels=384, + stride=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.decoder_block2 = DecoderBlockRes2BCond(in_channels=384, out_channels=384, + stride=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.decoder_block3 = DecoderBlockRes2BCond(in_channels=384, out_channels=256, + stride=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.decoder_block4 = DecoderBlockRes2BCond(in_channels=256, 
out_channels=128, + stride=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.decoder_block5 = DecoderBlockRes2BCond(in_channels=128, out_channels=64, + stride=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + self.decoder_block6 = DecoderBlockRes2BCond(in_channels=64, out_channels=32, + stride=(2, 2), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + + self.after_conv_block1 = ConvBlockResCond(in_channels=32, out_channels=32, + kernel_size=(3, 3), activation=activation, momentum=momentum, + cond_embedding_dim=cond_embedding_dim) + + self.after_conv2 = nn.Conv2d(in_channels=32, out_channels=1, + kernel_size=(1, 1), stride=(1, 1), padding=(0, 0), bias=True) + + self.init_weights() + + def init_weights(self): + init_layer(self.after_conv2) + + def forward(self, sp, cond_vec, dec_cond_vec): + """ + Args: + input: sp: (batch_size, channels_num, segment_samples) + Outputs: + output_dict: { + 'wav': (batch_size, channels_num, segment_samples), + 'sp': (batch_size, channels_num, time_steps, freq_bins)} + """ + + x = sp + # Pad spectrogram to be evenly divided by downsample ratio. + origin_len = x.shape[2] # time_steps + pad_len = int(np.ceil(x.shape[2] / self.downsample_ratio)) * self.downsample_ratio - origin_len + x = F.pad(x, pad=(0, 0, 0, pad_len)) + x = x[..., 0: x.shape[-1] - 2] # (bs, channels, T, F) + + # UNet + (x1_pool, x1) = self.encoder_block1(x, cond_vec) # x1_pool: (bs, 32, T / 2, F / 2) + (x2_pool, x2) = self.encoder_block2(x1_pool, cond_vec) # x2_pool: (bs, 64, T / 4, F / 4) + (x3_pool, x3) = self.encoder_block3(x2_pool, cond_vec) # x3_pool: (bs, 128, T / 8, F / 8) + (x4_pool, x4) = self.encoder_block4(x3_pool, dec_cond_vec) # x4_pool: (bs, 256, T / 16, F / 16) + (x5_pool, x5) = self.encoder_block5(x4_pool, dec_cond_vec) # x5_pool: (bs, 512, T / 32, F / 32) + (x6_pool, x6) = self.encoder_block6(x5_pool, dec_cond_vec) # x6_pool: (bs, 1024, T / 64, F / 64) + x_center = self.conv_block7(x6_pool, dec_cond_vec) # (bs, 2048, T / 64, F / 64) + x7 = self.decoder_block1(x_center, x6, dec_cond_vec) # (bs, 1024, T / 32, F / 32) + x8 = self.decoder_block2(x7, x5, dec_cond_vec) # (bs, 512, T / 16, F / 16) + x9 = self.decoder_block3(x8, x4, cond_vec) # (bs, 256, T / 8, F / 8) + x10 = self.decoder_block4(x9, x3, cond_vec) # (bs, 128, T / 4, F / 4) + x11 = self.decoder_block5(x10, x2, cond_vec) # (bs, 64, T / 2, F / 2) + x12 = self.decoder_block6(x11, x1, cond_vec) # (bs, 32, T, F) + x = self.after_conv_block1(x12, cond_vec) # (bs, 32, T, F) + x = self.after_conv2(x) # (bs, channels, T, F) + + # Recover shape + x = F.pad(x, pad=(0, 2)) + x = x[:, :, 0: origin_len, :] + return x + + +if __name__ == "__main__": + model = UNetRes_FiLM(channels=1, cond_embedding_dim=16) + cond_vec = torch.randn((1, 16)) + dec_vec = cond_vec + print(model(torch.randn((1, 1, 1001, 513)), cond_vec, dec_vec).size()) diff --git a/sound_extraction/model/text_encoder.py b/sound_extraction/model/text_encoder.py new file mode 100644 index 0000000..e785285 --- /dev/null +++ b/sound_extraction/model/text_encoder.py @@ -0,0 +1,45 @@ +import torch +import torch.nn as nn +from transformers import * +import warnings +warnings.filterwarnings('ignore') +# pretrained model name: (model class, model tokenizer, output dimension, token style) +MODELS = { + 'prajjwal1/bert-mini': (BertModel, BertTokenizer), +} + +class Text_Encoder(nn.Module): + def __init__(self, device): + super(Text_Encoder, self).__init__() + 
self.base_model = 'prajjwal1/bert-mini' + self.dropout = 0.1 + + self.tokenizer = MODELS[self.base_model][1].from_pretrained(self.base_model) + + self.bert_layer = MODELS[self.base_model][0].from_pretrained(self.base_model, + add_pooling_layer=False, + hidden_dropout_prob=self.dropout, + attention_probs_dropout_prob=self.dropout, + output_hidden_states=True) + + self.linear_layer = nn.Sequential(nn.Linear(256, 256), nn.ReLU(inplace=True)) + + self.device = device + + def tokenize(self, caption): + # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + tokenized = self.tokenizer(caption, add_special_tokens=False, padding=True, return_tensors='pt') + input_ids = tokenized['input_ids'] + attns_mask = tokenized['attention_mask'] + + input_ids = input_ids.to(self.device) + attns_mask = attns_mask.to(self.device) + return input_ids, attns_mask + + def forward(self, input_ids, attns_mask): + # input_ids, attns_mask = self.tokenize(caption) + output = self.bert_layer(input_ids=input_ids, attention_mask=attns_mask)[0] + cls_embed = output[:, 0, :] + text_embed = self.linear_layer(cls_embed) + + return text_embed, output # text_embed: (batch, hidden_size) \ No newline at end of file diff --git a/sound_extraction/utils/create_mixtures.py b/sound_extraction/utils/create_mixtures.py new file mode 100644 index 0000000..2b30d0d --- /dev/null +++ b/sound_extraction/utils/create_mixtures.py @@ -0,0 +1,98 @@ +import torch +import numpy as np + +def add_noise_and_scale(front, noise, snr_l=0, snr_h=0, scale_lower=1.0, scale_upper=1.0): + """ + :param front: front-head audio, like vocal [samples,channel], will be normlized so any scale will be fine + :param noise: noise, [samples,channel], any scale + :param snr_l: Optional + :param snr_h: Optional + :param scale_lower: Optional + :param scale_upper: Optional + :return: scaled front and noise (noisy = front + noise), all_mel_e2e outputs are noramlized within [-1 , 1] + """ + snr = None + noise, front = normalize_energy_torch(noise), normalize_energy_torch(front) # set noise and vocal to equal range [-1,1] + # print("normalize:",torch.max(noise),torch.max(front)) + if snr_l is not None and snr_h is not None: + front, noise, snr = _random_noise(front, noise, snr_l=snr_l, snr_h=snr_h) # remix them with a specific snr + + noisy, noise, front = unify_energy_torch(noise + front, noise, front) # normalize noisy, noise and vocal energy into [-1,1] + + # print("unify:", torch.max(noise), torch.max(front), torch.max(noisy)) + scale = _random_scale(scale_lower, scale_upper) # random scale these three signal + + # print("Scale",scale) + noisy, noise, front = noisy * scale, noise * scale, front * scale # apply scale + # print("after scale", torch.max(noisy), torch.max(noise), torch.max(front), snr, scale) + + front, noise = _to_numpy(front), _to_numpy(noise) # [num_samples] + mixed_wav = front + noise + + return front, noise, mixed_wav, snr, scale + +def _random_scale(lower=0.3, upper=0.9): + return float(uniform_torch(lower, upper)) + +def _random_noise(clean, noise, snr_l=None, snr_h=None): + snr = uniform_torch(snr_l,snr_h) + clean_weight = 10 ** (float(snr) / 20) + return clean, noise/clean_weight, snr + +def _to_numpy(wav): + return np.transpose(wav, (1, 0))[0].numpy() # [num_samples] + +def normalize_energy(audio, alpha = 1): + ''' + :param audio: 1d waveform, [batchsize, *], + :param alpha: the value of output range from: [-alpha,alpha] + :return: 1d waveform which value range from: [-alpha,alpha] + ''' + val_max = activelev(audio) + return (audio 
/ val_max) * alpha + +def normalize_energy_torch(audio, alpha = 1): + ''' + If the signal is almost empty(determined by threshold), if will only be divided by 2**15 + :param audio: 1d waveform, 2**15 + :param alpha: the value of output range from: [-alpha,alpha] + :return: 1d waveform which value range from: [-alpha,alpha] + ''' + val_max = activelev_torch([audio]) + return (audio / val_max) * alpha + +def unify_energy(*args): + max_amp = activelev(args) + mix_scale = 1.0/max_amp + return [x * mix_scale for x in args] + +def unify_energy_torch(*args): + max_amp = activelev_torch(args) + mix_scale = 1.0/max_amp + return [x * mix_scale for x in args] + +def activelev(*args): + ''' + need to update like matlab + ''' + return np.max(np.abs([*args])) + +def activelev_torch(*args): + ''' + need to update like matlab + ''' + res = [] + args = args[0] + for each in args: + res.append(torch.max(torch.abs(each))) + return max(res) + +def uniform_torch(lower, upper): + if(abs(lower-upper)<1e-5): + return upper + return (upper-lower)*torch.rand(1)+lower + +if __name__ == "__main__": + wav1 = torch.randn(1, 32000) + wav2 = torch.randn(1, 32000) + target, noise, snr, scale = add_noise_and_scale(wav1, wav2) diff --git a/sound_extraction/utils/stft.py b/sound_extraction/utils/stft.py new file mode 100644 index 0000000..04a1da9 --- /dev/null +++ b/sound_extraction/utils/stft.py @@ -0,0 +1,159 @@ +import torch +import numpy as np +import torch.nn.functional as F +from torch.autograd import Variable +from scipy.signal import get_window +import librosa.util as librosa_util +from librosa.util import pad_center, tiny +# from audio_processing import window_sumsquare + +def window_sumsquare(window, n_frames, hop_length=512, win_length=1024, + n_fft=1024, dtype=np.float32, norm=None): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + n_frames : int > 0 + The number of analysis frames + hop_length : int > 0 + The number of samples to advance between frames + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + n_fft : int > 0 + The length of each analysis frame. 
diff --git a/sound_extraction/utils/stft.py b/sound_extraction/utils/stft.py
new file mode 100644
index 0000000..04a1da9
--- /dev/null
+++ b/sound_extraction/utils/stft.py
@@ -0,0 +1,159 @@
+import torch
+import numpy as np
+import torch.nn.functional as F
+from torch.autograd import Variable
+from scipy.signal import get_window
+import librosa.util as librosa_util
+from librosa.util import pad_center, tiny
+# from audio_processing import window_sumsquare
+
+def window_sumsquare(window, n_frames, hop_length=512, win_length=1024,
+                     n_fft=1024, dtype=np.float32, norm=None):
+    """
+    # from librosa 0.6
+    Compute the sum-square envelope of a window function at a given hop length.
+    This is used to estimate modulation effects induced by windowing
+    observations in short-time Fourier transforms.
+    Parameters
+    ----------
+    window : string, tuple, number, callable, or list-like
+        Window specification, as in `get_window`
+    n_frames : int > 0
+        The number of analysis frames
+    hop_length : int > 0
+        The number of samples to advance between frames
+    win_length : [optional]
+        The length of the window function. By default, this matches `n_fft`.
+    n_fft : int > 0
+        The length of each analysis frame.
+    dtype : np.dtype
+        The data type of the output
+    Returns
+    -------
+    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
+        The sum-squared envelope of the window function
+    """
+    if win_length is None:
+        win_length = n_fft
+
+    n = n_fft + hop_length * (n_frames - 1)
+    x = np.zeros(n, dtype=dtype)
+
+    # Compute the squared window at the desired length
+    win_sq = get_window(window, win_length, fftbins=True)
+    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
+    win_sq = librosa_util.pad_center(win_sq, n_fft)
+
+    # Fill the envelope
+    for i in range(n_frames):
+        sample = i * hop_length
+        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
+    return x
+
+class STFT(torch.nn.Module):
+    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
+    def __init__(self, filter_length=1024, hop_length=512, win_length=1024,
+                 window='hann'):
+        super(STFT, self).__init__()
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.window = window
+        self.forward_transform = None
+        scale = self.filter_length / self.hop_length
+        fourier_basis = np.fft.fft(np.eye(self.filter_length))
+
+        cutoff = int((self.filter_length / 2 + 1))
+        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
+                                   np.imag(fourier_basis[:cutoff, :])])
+
+        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
+        inverse_basis = torch.FloatTensor(
+            np.linalg.pinv(scale * fourier_basis).T[:, None, :])
+
+        if window is not None:
+            assert filter_length >= win_length
+            # get window and zero center pad it to filter_length
+            fft_window = get_window(window, win_length, fftbins=True)
+            fft_window = pad_center(fft_window, filter_length)
+            fft_window = torch.from_numpy(fft_window).float()
+
+            # window the bases
+            forward_basis *= fft_window
+            inverse_basis *= fft_window
+
+        self.register_buffer('forward_basis', forward_basis.float())
+        self.register_buffer('inverse_basis', inverse_basis.float())
+
+    def transform(self, input_data):
+        num_batches = input_data.size(0)
+        num_samples = input_data.size(1)
+
+        self.num_samples = num_samples
+
+        # similar to librosa, reflect-pad the input
+        input_data = input_data.view(num_batches, 1, num_samples)
+        input_data = F.pad(
+            input_data.unsqueeze(1),
+            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
+            mode='reflect')
+        input_data = input_data.squeeze(1)
+
+        forward_transform = F.conv1d(
+            input_data,
+            Variable(self.forward_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0)
+
+        cutoff = int((self.filter_length / 2) + 1)
+        real_part = forward_transform[:, :cutoff, :]
+        imag_part = forward_transform[:, cutoff:, :]
+
+        magnitude = torch.sqrt(real_part**2 + imag_part**2)
+        phase = torch.autograd.Variable(
+            torch.atan2(imag_part.data, real_part.data))
+
+        return magnitude, phase  # [batch_size, F(513), T(1251)]
+
+    def inverse(self, magnitude, phase):
+        recombine_magnitude_phase = torch.cat(
+            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1)
+
+        inverse_transform = F.conv_transpose1d(
+            recombine_magnitude_phase,
+            Variable(self.inverse_basis, requires_grad=False),
+            stride=self.hop_length,
+            padding=0)
+
+        if self.window is not None:
+            window_sum = window_sumsquare(
+                self.window, magnitude.size(-1), hop_length=self.hop_length,
+                win_length=self.win_length, n_fft=self.filter_length,
+                dtype=np.float32)
+            # remove modulation effects
+            approx_nonzero_indices = torch.from_numpy(
+                np.where(window_sum > tiny(window_sum))[0])
+            window_sum = torch.autograd.Variable(
+                torch.from_numpy(window_sum), requires_grad=False)
+            window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
+            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
+
+            # scale by hop ratio
+            inverse_transform *= float(self.filter_length) / self.hop_length
+
+        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2):]
+        inverse_transform = inverse_transform[:, :, :-int(self.filter_length / 2)]
+
+        return inverse_transform  # [batch_size, 1, sample_num]
+
+    def forward(self, input_data):
+        self.magnitude, self.phase = self.transform(input_data)
+        reconstruction = self.inverse(self.magnitude, self.phase)
+        return reconstruction
+
+if __name__ == '__main__':
+    a = torch.randn(4, 320000)
+    stft = STFT()
+    mag, phase = stft.transform(a)
+    # rec_a = stft.inverse(mag, phase)
+    print(mag.shape)
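A quick round-trip sketch for the STFT module above (the batch size and the 2-second, 32 kHz signal length are arbitrary illustrations):

import torch
from sound_extraction.utils.stft import STFT

stft = STFT(filter_length=1024, hop_length=512, win_length=1024, window='hann')
wav = torch.randn(1, 64000)       # [batch, samples]: 2 s of audio at 32 kHz
mag, phase = stft.transform(wav)  # magnitude and phase, each [batch, 513, frames]
rec = stft.inverse(mag, phase)    # [batch, 1, samples], approximately reconstructs wav
print(mag.shape, rec.shape)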
diff --git a/sound_extraction/utils/wav_io.py b/sound_extraction/utils/wav_io.py
new file mode 100644
index 0000000..79f9916
--- /dev/null
+++ b/sound_extraction/utils/wav_io.py
@@ -0,0 +1,23 @@
+import librosa
+import librosa.filters
+import math
+import numpy as np
+import scipy.io.wavfile
+
+def load_wav(path):
+    max_length = 32000 * 10
+    wav = librosa.core.load(path, sr=32000)[0]
+    if len(wav) > max_length:
+        wav = wav[0:max_length]  # truncate to 10 s
+
+    # pad audio to max length, 10 s for AudioCaps
+    if len(wav) < max_length:
+        # audio = torch.nn.functional.pad(audio, (0, self.max_length - audio.size(1)), 'constant')
+        wav = np.pad(wav, (0, max_length - len(wav)), 'constant')
+    wav = wav[..., None]  # [samples, 1]
+    return wav
+
+
+def save_wav(wav, path):
+    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
+    scipy.io.wavfile.write(path, 32000, wav.astype(np.int16))
\ No newline at end of file
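A small sketch of the wav_io helpers in use (the file paths are placeholders):

import numpy as np
from sound_extraction.utils.wav_io import load_wav, save_wav

wav = load_wav('audio/example.wav')                 # float32, padded or truncated to 10 s at 32 kHz, shape (320000, 1)
save_wav(wav.squeeze(-1).copy(), 'audio/copy.wav')  # writes peak-normalized 16-bit PCM at 32 kHz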