Files
AudioGPT/NeuralSeq/usr/__pycache__/diffsinger_task.cpython-37.pyc

144 lines
16 KiB
Plaintext
Raw Normal View History

2023-03-20 15:43:44 +08:00
B
2023-03-24 17:19:37 +08:00
<00>Xd<>X<00>@s\ddlZddlZddlmZddlmZddlmZmZddl m
2023-03-20 15:43:44 +08:00
Z
ddl m Z m Z ddlmZdd lmZdd
lmZdd lmZdd lmZdd lmZddlmZddlmZddlZddl Z ddl!m"m#Z$dd<11>dd<11>d<13>Z%Gdd<15>de
2023-03-24 17:19:37 +08:00
<EFBFBD>Z&Gdd<17>de<1B>Z'Gdd<19>de&<26>Z(Gdd<1B>de<1B>Z)Gdd<1D>de<1B>Z*Gdd<1F>de&<26>Z+Gd d!<21>d!e<1D>Z,dS)"<22>N)<01>hparams<6D>)<01>DiffNet)<02>GaussianDiffusion<6F>OfflineGaussianDiffusion)<01>DiffSpeechTask)<02>get_vocoder_cls<6C> BaseVocoder)<01>PitchExtractor)<01> FastSpeech2)<01>FastSpeech2MIDI)<01> mel2ph_to_dur)<01>FFT)<01> denorm_f0)<01>FastSpeechDataset)<01>FastSpeech2TaskcCs t|d<00>S)N<>audio_num_mel_bins)r)<01>hp<68>r<00>V/mnt/sdc/hongzhiqing/code/audio_chatgpt/text_to_sing/DiffSinger/usr/diffsinger_task.py<70><lambda><00>rcCst|d|d|d|d<00>S)N<> hidden_size<7A>
2023-03-20 15:43:44 +08:00
dec_layers<EFBFBD>dec_ffn_kernel_size<7A> num_heads)r)rrrrrs)Zwavenet<65>fftcs,eZdZ<02>fdd<02>Zdd<04>Zdd<06>Z<05>ZS)<07>DiffSingerTaskcshtt|<00><02><02>t|_tt<06><01>|_t<06>d<01>dk rdtdrdt <09><00>
<EFBFBD>|_ t j |j tdddd<05>|j <0B><0E>dS)N<> pe_enable<6C>pe_ckpt<70>modelT)<01>strict)<0F>superr<00>__init__r<00> dataset_clsrr<00>vocoder<65>getr
<00>cuda<64>pe<70>utils<6C> load_ckpt<70>eval)<01>self)<01> __class__rrr#s  zDiffSingerTask.__init__c
Cs<>td}t|j|ttdt<00>tdtdtdtdtdd<08>|_td d
kr<>tj|jjtd d d d <0A>x|jj<07><08>D]\}}d|_ qxWdS)Nr<00>diff_decoder_type<70> timesteps<70>K_step<65>diff_loss_type<70>spec_min<69>spec_max)<08> phone_encoder<65>out_dims<6D>
denoise_fnr/r0<00> loss_typer2r3<00>fs2_ckpt<70>r T)r!F)
rrr4<00> DIFF_DECODERSr r)r*<00>fs2<73>named_parameters<72> requires_grad)r,<00>mel_bins<6E>k<>vrrr<00>build_tts_model(s  zDiffSingerTask.build_tts_modelc
Cs<>i}|d}|d}|d}tds.|<01>d<05>n|<01>d<06>}|d}|d} |d }
i|d
<|j|j|d d d <0A>\|d
<} t|d
<00><05><00>|d<|d|d<t<06>|<03>}|tdk<00>r<>|j|||| |
|dd d<11>} t<00>d<12>dk <09>rtd<00>r|<00>|d<00>d} |<00>| d<00>d} nt |d|d t<00>} | <0B>d<15>} |j
||d| dd | | d<16>|j ||d| dd|<02><00>d<18>|j ||d| dd|<02><00>d<18>|S)N<>
txt_tokens<EFBFBD>mels<6C>energy<67>
use_spk_id<EFBFBD> spk_embed<65>spk_ids<64>mel2ph<70>f0<66>uv<75>lossesTF)<02> return_output<75>infer<65>
total_loss<EFBFBD>nsamples<65>num_valid_plots)rFrHrIrJrD<00>ref_melsrMr<00>f0_denorm_pred<65>mel_out<75> f0_denorm)<03>is_mel<65>gt_f0rI<00>diffmel_)<01>name<6D>fs2_mel<65>fs2mel_) rr&<00> run_modelr <00>sum<75>valuesr)<00>tensors_to_scalarsr(r<00>plot_wav<61>plot_mel)r,<00>sample<6C> batch_idx<64>outputsrB<00>targetrDrFrHrIrJ<00> model_outrV<00>pred_f0rrr<00>validation_stepBs2 

  zDiffSingerTask.validation_step)<07>__name__<5F>
__module__<EFBFBD> __qualname__r#rArg<00> __classcell__rr)r-rrs rcs(eZdZ<02>fdd<02>Z<03>fdd<04>Z<04>ZS)<05>ShallowDiffusionOfflineDatasetcsptt|<00><02>|<01>}|<00>|<01>}|jdkrltddkrltj<07>td<00>}|d}t <09>
t <0B> |<04>d|<05>d<06><04><01>}||d<|S)N<>trainr8r9<00> item_namez /P_mels_npy/z.npyrY) r"rl<00> __getitem__<5F> _get_item<65>prefixr<00>os<6F>path<74>dirname<6D>torch<63>Tensor<6F>np<6E>load)r,<00>indexra<00>itemr8rnrY)r-rrrogs
z*ShallowDiffusionOfflineDataset.__getitem__csDtt|<00><02>|<01>}|jdkr@tddkr@t<05>dd<05>|D<00>d<06>|d<|S)Nrmr8r9cSsg|] }|d<00>qS)rYr)<02>.0<EFBFBD>srrr<00>
<listcomp>usz;ShallowDiffusionOfflineDataset.collater.<locals>.<listcomp>g<00>fs2_mels)r"rl<00>collaterrqrr)<00>
collate_2d)r,<00>samples<65>batch)r-rrrrsz'ShallowDiffusionOfflineDataset.collater)rhrirjrorrkrr)r-rrlfs rlcs>eZdZ<02>fdd<02>Zdd<04>Zd dd<07>Zdd <09>Zd
d <0B>Z<07>ZS) <0A>DiffSingerOfflineTaskcstt|<00><02><02>t|_dS)N)r"r<>r#rlr$)r,)r-rrr#zszDiffSingerOfflineTask.__init__c
CsHtd}t|j|ttdt<00>tdtdtdtdtdd<08>|_dS) Nrr.r/r0r1r2r3)r4r5r6r/r0r7r2r3)rrr4r:r )r,r>rrrrA~sz%DiffSingerOfflineTask.build_tts_modelFc
Cs<>|d}|d}|d}|d}|d} |d}
d} tdsF|<02>d<08>n|<02>d <09>} td
d kr<>|d } |d }|d}|<01>| |||<07>|d<}|||| || g|| |
|d<10>}i}d|kr<>|d|d<tdr<>|<00>|d|
|<11>|s<>|S||fSdS)NrBrCrHrIrJrDrErFrG<00>
pitch_type<EFBFBD>cwt<77>cwt_spec<65>f0_mean<61>f0_std<74>f0_cwt)rHrFrQrIrJrDrM<00> diff_loss<73>mel<65>use_energy_embed<65> energy_pred)rr&<00> cwt2f0_norm<72>add_energy_loss)r,r rarLrMrBrdrHrIrJrDrYrFr<>r<>r<><00>outputrKrrrr[<00>s.  zDiffSingerOfflineTask.run_modelc
Cs<>i}|d}|d}|d}tds.|<01>d<05>n|<01>d<06>}|d}|d} |d }
i|d
<|j|j|d d d <0A>\|d
<} t|d
<00><05><00>|d<|d|d<t<06>|<03>}|tdk<00>r<>|d} |j|||| |
|d| gd d<12>} t<00>d<13>dk <09>rtd<00>r|<00>|d<00>d} |<00>| d<00>d}nt |d|d t<00>} | <0B>d<16>}|j
||d| dd | |d<17>|j ||d| dd|<02><00>d<19>|j ||d| d|<02><00>d<19>|S)NrBrCrDrErFrGrHrIrJrKTF)rLrMrNrOrPr~)rFrHrIrJrDrQrMrrRrSrT)rUrVrIrW)rXrZ) rr&r[r r\r]r)r^r(rr_r`)r,rarbrcrBrdrDrFrHrIrJrerYrVrfrrrrg<00>s6 
 
 z%DiffSingerOfflineTask.validation_stepc
Cs&tds|<01>d<02>n|<01>d<03>}|d}|d}tdr6n<36>d\}}}tdrP|d }td
rh|d }|d }|d } |j|||||d| g|dd<0F>}
|j<02>|
d<00>|d<|
d |d<t<00>d<13>dk r<>tdr<>|<00>|d<00>d|d <|<00>|d<00>d|d<n&t|d |d t<00>|d <|
<EFBFBD>d<17>|d<|<00>|<01>SdS)NrErFrGrBrD<00> profile_infer)NNNZ
use_gt_durrHZ use_gt_f0rIrJr~T)rFrHrIrJrQrDrMrSrcZ mel2ph_predrrCrR<00>f0_predrT)rr&r <00>out2melr(rZ after_infer) r,rarbrFrBrDrHrJrIrYrcrrr<00> test_step<65>s.
 zDiffSingerOfflineTask.test_step)FF) rhrirjr#rAr[rgr<>rkrr)r-rr<>ys
 
$r<>cs(eZdZ<02>fdd<02>Z<03>fdd<04>Z<04>ZS)<05> MIDIDatasetcsNtt|<00><02>|<01>}|<00>|<01>}t<04>|d<00>|d<t<04>|d<00>dtd<00>|d<|S)N<>f0_midi<64>
pitch_midi<EFBFBD>
max_frames)r"r<>rorpru<00> FloatTensor<6F>
LongTensorr)r,ryrarz)r-rrro<00>s

zMIDIDataset.__getitem__csHtt|<00><02>|<01>}t<03>dd<02>|D<00>d<03>|d<t<03>dd<02>|D<00>d<06>|d<|S)NcSsg|] }|d<00>qS)r<>r)r{r|rrrr}<00>sz(MIDIDataset.collater.<locals>.<listcomp>gr<>cSsg|] }|d<00>qS)r<>r)r{r|rrrr}<00>srr<>)r"r<>rr)<00>
collate_1d)r,r<>r<>)r-rrr<00>szMIDIDataset.collater)rhrirjrorrkrr)r-rr<><00>s r<>cs(eZdZ<02>fdd<02>Z<03>fdd<04>Z<04>ZS)<05>OpencpopDatasetcs<>tt|<00><02>|<01>}|<00>|<01>}t<04>|d<00>dtd<00>|d<t<04>|d<00>dtd<00>|d<t<04>|d<00>dtd<00>|d<t<04>|d<00>dtd<00>|d<|S)Nr<4E>r<><00>midi_dur<75>is_slur<75> word_boundary)r"r<>rorprur<>rr<>)r,ryrarz)r-rrro<00>s
zOpencpopDataset.__getitem__cs|tt|<00><02>|<01>}t<03>dd<02>|D<00>d<03>|d<t<03>dd<02>|D<00>d<03>|d<t<03>dd<02>|D<00>d<03>|d<t<03>d d<02>|D<00>d<03>|d
<|S) NcSsg|] }|d<00>qS)r<>r)r{r|rrrr}
sz,OpencpopDataset.collater.<locals>.<listcomp>rr<>cSsg|] }|d<00>qS)r<>r)r{r|rrrr} sr<>cSsg|] }|d<00>qS)r<>r)r{r|rrrr} sr<>cSsg|] }|d<00>qS)r<>r)r{r|rrrr} sr<>)r"r<>rr)r<>)r,r<>r<>)r-rrrs zOpencpopDataset.collater)rhrirjrorrkrr)r-rr<><00>s r<>cs8eZdZ<02>fdd<02>Zd dd<05>Zdd<07>Zd d d
<EFBFBD>Z<06>ZS) <0A>DiffSingerMIDITaskcstt|<00><02><02>t|_dS)N)r"r<>r#r<>r$)r,)r-rrr#szDiffSingerMIDITask.__init__FcCsZ|d}|d}|d}t<00>d<04>dk r>|jtdkr>d}d} n|d}|d} |d}
tdsh|<02>d <09>n|<02>d
<EFBFBD>} td d kr<>|d } |d} |d}|<01>| | ||<07>|d<}|||| ||| |
||d|<02>d<12>|<02>d<13>d<14> }i}d|kr<>|d|d<|j|d|||d|d<19>td<00>r(|<00>|||<10>td<00>rD|<00>|d|
|<10>|<03>sN|S||fSdS)NrBrCrHZswitch_midi2f0_steprIrJrDrErFrGr<>r<>r<>r<>r<>r<>r<>r<>r<>)
rHrFrQrIrJrDrMr<>r<>r<>r<>r<><00>durr<72>)rK<00>use_pitch_embedr<64>r<>)rr&Z global_stepr<70><00> add_dur_loss<73>add_pitch_lossr<73>)r,r rarLrMrBrdrHrIrJrDrFr<>r<>r<>r<>rKrrrr[s:  

zDiffSingerMIDITask.run_modelc Cs<>i}|d}|d}|d}tds.|<01>d<05>n|<01>d<06>}|d}i|d<|j|j|d d
d <0B>\|d<} t|d<00><05><00>|d <|d |d <t<06>|<03>}|tdk<00>r<>|j|||dd|dd |d|<01>d<10>|<01>d<11>d<12> } t<00>d<13>dk <09>r td<00>r |<00>|d<00>d}
|<00>| d<00>d} nt |d|dt<00>}
| <09>d<18>} |j
||d| dd |
| d<19>|j ||d| dd|<02><00>d<1B>|j ||d| dd|<02><00>d<1B>td<00>r<>|<00> ||| <09>|S)NrBrCrDrErFrGrHrKTF)rLrMrNrOrPr<>r<>r<>)
rFrHrIrJrDrQrMr<>r<>r<>rrRrSrIrJrT)rUrVrIrW)rXrYrZr<>) rr&r[r r\r]r)r^r(rr_r`<00>
plot_pitch) r,rarbrcrBrdrDrFrHrerVrfrrrrg<s4 

  
z"DiffSingerMIDITask.validation_stepNcCs<>|j\}}|dk<03><01>}t||<07><02><01>|} t<03>|<03><01><05>}
x&|jD]} |
||j<07>| <0B>dkB}
q>W|
<EFBFBD><01>}
t ddkr<>t
j || d<00> <0C>dd<06>|d<|d|<00> <0A>|<08> <0A>|d<|<01><0E>djdd<08>}nt<10>t d dk<04>r<>t
<EFBFBD>|jdd
<EFBFBD>d <0B>d d <0C>d d <0A>f} |<01>|| <0C><14>dg<02><01>d| |<01>} | <09>|| <0C><14>dg<02><01>d| | <09>}t
j | d<00> <0C>|d<00> <0C>dd<06>}|dk<04><01>}||<00> <0A>|<10> <0A>}|t d |d<t ddk<04>r<>|<01> d <0A>}| <09> d <0A>}t
j |d<00> <0C>|d<00> <0C>dd<06>}|<13><16>t d|d<d S)z<>
:param dur_pred: [B, T], float, log scale
:param mel2ph: [B, T]
:param txt_tokens: [B, T]
:param losses:
:return:
r<00>dur_loss<73>mser<00>none)<01> reduction<6F>pdur)<01>min<69>lambda_word_dur)<01>axis)rrN<><4E><EFBFBD><EFBFBD><EFBFBD><EFBFBD>wdur<75>lambda_sent_dur<75>mean<61>sdur)<17>shape<70>floatr ru<00>
zeros_like<EFBFBD>bool<6F>sil_phr4<00>encoder<00>F<>mse_loss<73>logr\<00>exp<78>clamp<6D>NotImplementedError<6F>pad<61>cumsum<75> new_zeros<6F>max<61> scatter_addr<64>)r,<00>dur_predrHrB<00>wdbrK<00>B<>T<>
nonpadding<EFBFBD>dur_gt<67>is_sil<69>p<>idx<64>
word_dur_p<EFBFBD>
word_dur_g<EFBFBD> wdur_loss<73>word_nonpadding<6E>
sent_dur_p<EFBFBD>
sent_dur_g<EFBFBD> sdur_lossrrrr<>_s2
   $    

 zDiffSingerMIDITask.add_dur_loss)FF)N)rhrirjr#r[rgr<>rkrr)r-rr<>s 
%#r<>cs@eZdZ<02>fdd<02>Zdd<04>Zd dd<07>Zdd d
<EFBFBD>Zd d <0C>Z<07>ZS)<0F>AuxDecoderMIDITaskcst<00><00><01>t|_dS)N)r"r#r<>r$)r,)r-rrr#<00>s
zAuxDecoderMIDITask.__init__cCs4t<00>d<01>dk r$tdr$t|j<03>|_n t|j<03>|_dS)N<>use_midi)rr&r r4r r )r,rrrrA<00>sz"AuxDecoderMIDITask.build_tts_modelFcCs0|d}|d}|d}|d}|d}|d} tdsB|<02>d<08>n|<02>d <09>}
td
d kr<>|d } |d } |d} |<01>| | | |<06>|d<}||||
|||| d|d|<02>d<12>|<02>d<13>d<14> }i}|<00>|d||<0F>|j|d|||d|d<18>tdr<>|<00>|||<0F>td<00>r|<00>|d| |<0F>|<03>s$|S||fSdS)NrBrCrHrIrJrDrErFrGr<>r<>r<>r<>r<>r<>Fr<46>r<>r<>)
rHrFrQrIrJrDrMr<>r<>r<>rSr<>r<>)rKr<>r<>r<>)rr&r<>Z add_mel_lossr<73>r<>r<>)r,r rarLrBrdrHrIrJrDrFr<>r<>r<>r<>rKrrrr[<00>s2 
zAuxDecoderMIDITask.run_modelNcCs<>|j\}}|dk<03><01>}t||<07><02><01>|} t<03>|<03><01><05>}
x&|jD]} |
||j<07>| <0B>dkB}
q>W|
<EFBFBD><01>}
t ddkr<>t
j || d<00> <0C>dd<06>|d<|d|<00> <0A>|<08> <0A>|d<|<01><0E>djdd<08>}nt<10>t d dk<04>r<>t
<EFBFBD>|jdd
<EFBFBD>d <0B>d d <0C>d d <0A>f} |<01>|| <0C><14>dg<02><01>d| |<01>} | <09>|| <0C><14>dg<02><01>d| | <09>}t
j | d<00> <0C>|d<00> <0C>dd<06>}|dk<04><01>}||<00> <0A>|<10> <0A>}|t d |d<t ddk<04>r<>|<01> d <0A>}| <09> d <0A>}t
j |d<00> <0C>|d<00> <0C>dd<06>}|<13><16>t d|d<d S)z<>
:param dur_pred: [B, T], float, log scale
:param mel2ph: [B, T]
:param txt_tokens: [B, T]
:param losses:
:return:
rr<>r<>rr<>)r<>r<>)r<>r<>)r<>)rrNr<4E>r<>r<>r<>r<>)r<>r<>r rur<>r<>r<>r4r<>rr<>r<>r<>r\r<>r<>r<>r<>r<>r<>r<>r<>r<>)r,r<>rHrBr<>rKr<>r<>r<>r<>r<>r<>r<>r<>r<>r<>r<>r<>r<>r<>rrrr<><00>s2
   $    

 zAuxDecoderMIDITask.add_dur_losscCs<>i}i|d<|j|j|dd<03>\|d<}t|d<00><03><00>|d<|d|d<|j<01>|d<00>}t<05>|<03>}|tdkr<>|<00>||d|<05>|<00> |||<04>td r<>|<00>
|||<04>|S)
NrKT)rLrNrOrSrPrCr<>) r[r r\r]r<>r)r^rr`Zplot_durr<72>)r,rarbrcrerSrrrrg<00>s 
 z"AuxDecoderMIDITask.validation_step)F)N) rhrirjr#rAr[r<>rgrkrr)r-rr<><00>s
 

(r<>)-rur)<00> utils.hparamsrZdiff.netrZdiff.shallow_diffusion_ttsrrZdiffspeech_taskr<00>vocoders.base_vocoderrr <00>modules.fastspeech.per
<00>modules.fastspeech.fs2r <00>modules.diffsinger_midi.fs2r <00>modules.fastspeech.tts_modulesr Zusr.diff.candidate_decoderr<00>utils.pitch_utilsrZtasks.tts.fs2_utilsrZ tasks.tts.fs2r<00>numpyrwrrZtorch.nn.functional<61>nn<6E>
functionalr<EFBFBD>r:rrlr<>r<>r<>r<>r<>rrrr<00><module>s4            Htw