# Model configuration: semantic codec, s2mel (DiT + wavenet) hyperparameters,
# checkpoint filenames, and vocoder selection.
#
# NOTE(review): the nesting below was reconstructed from key order. Several
# keys (hidden_dim, in_channels, content_codebook_size, f0_condition,
# n_f0_bins, style_condition) occur more than once with different owners, so
# they must live in separate sub-mappings to avoid duplicate-key collisions.
# Confirm the hierarchy against the code that loads this file.

semantic_codec:
  conf:
    codebook_size: 8192
    hidden_size: 1024
    codebook_dim: 8
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
  # NOTE(review): placed as a sibling of `conf`; confirm it is not meant to
  # sit inside `conf` or at the top level.
  checkpoint: semantic_codec.safetensors

s2mel:
  preprocess_params:
    sr: 22050
    # NOTE(review): assumed to be nested under preprocess_params - confirm.
    spect_params:
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      n_mels: 80
      fmin: 0
      # Deliberately the quoted string "None", not a YAML null; presumably
      # the consumer compares against this literal - verify before changing.
      fmax: "None"

  dit_type: "DiT"
  reg_loss_type: "l1"

  style_encoder:
    dim: 192

  length_regulator:
    channels: 512
    is_discrete: false
    in_channels: 1024
    content_codebook_size: 2048
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 1
    quantizer_dropout: 0.0
    f0_condition: false
    n_f0_bins: 512

  DiT:
    hidden_dim: 512
    num_heads: 8
    depth: 13
    class_dropout_prob: 0.1
    block_size: 8192
    in_channels: 80
    style_condition: true
    final_layer_type: 'wavenet'
    target: 'mel'
    content_dim: 512
    # Differs from length_regulator.content_codebook_size (2048).
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: true
    zero_prompt_speech_token: false
    time_as_token: false
    style_as_token: false
    uvit_skip_connection: true
    add_resblock_in_transformer: false

  wavenet:
    hidden_dim: 512
    num_layers: 8
    kernel_size: 5
    dilation_rate: 1
    p_dropout: 0.2
    style_condition: true

# Checkpoint filenames; how they are resolved to paths is up to the loader.
gpt_checkpoint: gpt.pth
s2mel_checkpoint: s2mel.pth

vocoder:
  type: "bigvgan"
  name: "nvidia/bigvgan_v2_22khz_80band_256x"