Wuro — Text-to-Speech for Bambara and Bomu
Wuro is a text-to-speech (TTS) model designed to generate speech in Bambara and Bomu from text.
Description
This model was developed for speech synthesis in Malian languages, especially Bambara and Bomu.
It uses an autoregressive approach that combines tokenized text with compressed audio codes.
Supported Languages
- Bambara (bm)
- Bomu (bmq)
Training Data
The model was trained on a TTS dataset built from multiple sources in Bambara and Bomu.
Dataset used:
Base model: maya-research/maya1
Features
- Text-to-speech generation
- Multilingual support (Bambara and Bomu)
- Audio generation based on compressed SNAC tokens
Usage
Installation
pip install unsloth snac bambara-text-normalizer
# Load the Wuro model and its tokenizer through Unsloth, then switch the
# model into Unsloth's optimized inference mode.
from unsloth import FastLanguageModel
import torch
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="YazoPi/Wuro",
max_seq_length=4096,
dtype=None,  # let Unsloth auto-select the dtype
load_in_4bit=False,
)
FastLanguageModel.for_inference(model)
Bambara Normalization
The bambara_normalizer module is used only for Bambara. It is used to normalize certain parts of the text before generation.
For more information on how to use it, see: bambara-text-normalizer
import torch
from snac import SNAC
import soundfile as sf
import numpy as np
from unsloth import FastLanguageModel
from bambara_normalizer import normalize_dates_in_text, normalize_measurements_in_text, normalize_numbers_in_text, normalize_times_in_text
# Special token IDs used by the Wuro prompt format (a Llama-3-style
# tokenizer extended with audio-code tokens).
CODE_START_TOKEN_ID = 128257   # start-of-speech marker
CODE_END_TOKEN_ID = 128258     # end-of-speech marker
CODE_TOKEN_OFFSET = 128266     # first SNAC audio-code token id
SNAC_MIN_ID = 128266           # lowest valid SNAC token id
SNAC_MAX_ID = 156937           # highest valid SNAC token id
SNAC_TOKENS_PER_FRAME = 7      # one SNAC frame = 7 interleaved codes
SOH_ID = 128259                # start-of-human turn
EOH_ID = 128260                # end-of-human turn
SOA_ID = 128261                # start-of-assistant turn
BOS_ID = 128000                # beginning-of-sequence
TEXT_EOT_ID = 128009           # end-of-text
tokeniser_length = 128256
pad_token = tokeniser_length + 7


def build_prompt(tokenizer, description: str, text: str, normalize=False) -> str:
    """Build the formatted prompt string expected by the model.

    Args:
        tokenizer: tokenizer exposing ``decode`` and ``bos_token``.
        description: voice/style description embedded in the prompt.
        text: text to synthesize.
        normalize: when True, run the Bambara normalizers (dates, times,
            measurements, numbers) on ``text`` before formatting.

    Returns:
        The prompt: SOH + BOS + tagged text + EOT + EOH + SOA + SOS.
    """
    if normalize:
        # The normalizers are applied in a fixed order, each expanding one
        # category of written forms into words.
        text = normalize_dates_in_text(text)
        text = normalize_times_in_text(text)
        text = normalize_measurements_in_text(text)
        text = normalize_numbers_in_text(text)

    tagged_text = f'<description="{description}"> {text}'
    pieces = [
        tokenizer.decode([SOH_ID]),
        tokenizer.bos_token,
        tagged_text,
        tokenizer.decode([TEXT_EOT_ID]),
        tokenizer.decode([EOH_ID]),
        tokenizer.decode([SOA_ID]),
        tokenizer.decode([CODE_START_TOKEN_ID]),
    ]
    return "".join(pieces)
def extract_snac_codes(token_ids: list) -> list:
    """Return the SNAC audio-code tokens preceding the first end-of-speech token.

    Tokens outside the SNAC id range are dropped. When no end-of-speech
    token is present the whole sequence is scanned.
    """
    if CODE_END_TOKEN_ID in token_ids:
        limit = token_ids.index(CODE_END_TOKEN_ID)
    else:
        limit = len(token_ids)

    codes = []
    for tid in token_ids[:limit]:
        if SNAC_MIN_ID <= tid <= SNAC_MAX_ID:
            codes.append(tid)
    return codes
def unpack_snac_from_7(snac_tokens: list) -> list:
    """Split flat 7-token SNAC frames into the 3 hierarchical codebook levels.

    Each 7-token frame maps to: one level-1 code (slot 0), two level-2 codes
    (slots 1 and 4), and four level-3 codes (slots 2, 3, 5, 6). A trailing
    end-of-speech token, if present, is stripped first; an incomplete
    trailing frame is discarded.

    Returns:
        ``[level1, level2, level3]`` lists of codebook indices in [0, 4096).
    """
    tokens = list(snac_tokens)
    if tokens and tokens[-1] == CODE_END_TOKEN_ID:
        tokens.pop()

    n_frames = len(tokens) // SNAC_TOKENS_PER_FRAME
    if n_frames == 0:
        return [[], [], []]

    def to_code(tok):
        # Map a token id back to a codebook index.
        return (tok - CODE_TOKEN_OFFSET) % 4096

    level1, level2, level3 = [], [], []
    for start in range(0, n_frames * SNAC_TOKENS_PER_FRAME, SNAC_TOKENS_PER_FRAME):
        frame = tokens[start:start + SNAC_TOKENS_PER_FRAME]
        level1.append(to_code(frame[0]))
        level2.append(to_code(frame[1]))
        level2.append(to_code(frame[4]))
        level3.append(to_code(frame[2]))
        level3.append(to_code(frame[3]))
        level3.append(to_code(frame[5]))
        level3.append(to_code(frame[6]))
    return [level1, level2, level3]
# Load the SNAC neural audio codec (24 kHz) used to decode the generated
# audio codes into a waveform, and move it to the GPU when one is available.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
# nn.Module.to() transfers all submodules (quantizer, decoder) recursively,
# so no per-submodule transfer is needed.
snac_model = snac_model.to(device)
def main(
    description="bomu",
    text="bwe wa wuro.",
    temp=1.0,
    top_p=0.95,
    max_tokens=4096,
    min_tokens=28,
    rp=1.2,
    do_sample=True,
    normalize=False,
):
    """Generate speech for ``text`` and write it to a 24 kHz WAV file.

    Uses the module-level ``model``/``tokenizer`` to generate SNAC audio
    tokens, decodes them with the module-level ``snac_model``, and writes
    the waveform to ``{description}-{text}.wav``.

    Args:
        description: voice/style description (or language tag) for the prompt.
        text: text to synthesize.
        temp: sampling temperature.
        top_p: nucleus-sampling threshold.
        max_tokens: maximum number of new tokens to generate.
        min_tokens: minimum number of new tokens to generate.
        rp: repetition penalty.
        do_sample: sample (True) or decode greedily (False).
        normalize: run the Bambara text normalizers before generation.

    Returns:
        The output WAV filename, or None when too few SNAC tokens were
        generated to form a single frame.
    """
    prompt = build_prompt(tokenizer, description, text, normalize)
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    if torch.cuda.is_available():
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            min_new_tokens=min_tokens,
            temperature=temp,
            top_p=top_p,
            repetition_penalty=rp,
            do_sample=do_sample,
            eos_token_id=CODE_END_TOKEN_ID,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Keep only the newly generated tokens (drop the echoed prompt).
    generated_ids = outputs[0, inputs["input_ids"].shape[1]:].tolist()

    if CODE_START_TOKEN_ID not in generated_ids:
        print("No SOS token found in generated output!")

    snac_tokens = extract_snac_codes(generated_ids)
    if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
        print("Error: Not enough SNAC tokens generated")
        return None

    levels = unpack_snac_from_7(snac_tokens)
    frames = len(levels[0])
    print(f"Unpacked to {frames} frames")
    print(f"L1: {len(levels[0])} codes")
    print(f"L2: {len(levels[1])} codes")
    print(f"L3: {len(levels[2])} codes")

    codes_tensor = [
        torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
        for level in levels
    ]

    print("Decoding to audio...")
    with torch.inference_mode():
        z_q = snac_model.quantizer.from_codes(codes_tensor)
        audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()

    # NOTE(review): this drops the first `max_tokens` *samples* whenever the
    # clip is long enough — comparing a sample count against a token budget
    # looks suspicious; confirm the intended trimming behavior.
    if len(audio) > max_tokens:
        audio = audio[max_tokens:]

    duration_sec = len(audio) / 24000
    print(f"Audio generated: {len(audio)} samples ({duration_sec:.2f}s)")

    # NOTE(review): `text` is embedded verbatim in the filename; punctuation
    # such as ':' or '/' may produce invalid paths on some platforms.
    output_file = f"{description}-{text}.wav"
    sf.write(output_file, audio, 24000)
    return output_file
Example with Bomu
A Bomu audio sample.
# Example: synthesize a Bomu sentence with conservative sampling settings.
main(
description="bomu",
text="Hee wa banu yɛrɛ wuro.",
max_tokens=2048,
temp=0.4,
top_p=0.9,
rp=1.1
)
You can also read a Bambara text with a Bwa (Bomu) accent:
# Example: read a Bambara sentence with a Bwa (Bomu) accent by keeping the
# "bomu" description while passing Bambara text.
main(
description="bomu",
text="An me kɛrɛlamana dɔw don a tigilamɔgɔ la.",
max_tokens=2048,
temp=0.4,
top_p=0.9,
rp=1.1,
)
Example with Bambara
# Example: Bambara synthesis using a free-text voice description.
main(
description="A Male voice with a bambara accent.",
text="Mali ye anw faso de ye !",
max_tokens=2048,
temp=0.4,
top_p=0.9,
rp=1.1
)
Example with text normalization
The bambara-text-normalizer package is very useful for normalizing text before speech synthesis:
# Example: greedy decoding with Bambara text normalization enabled, so
# dates, times, measurements and numbers are expanded before synthesis.
main(
description="Clear bambara voice.",
text="Ne taara sugu la 24-12-2025 la, 10:45 waati, ne ye tulu 6 l san ani sukaro 10 kg.",
normalize = True,
do_sample = False
)
Read a Bomu text with a Bambara accent:
# Example: read a Bomu sentence with a Bambara accent by using the
# "bambara" description with Bomu text.
main(
description="bambara",
text="Yacouba, yɛrɛ we zin ma nucoza ue.",
max_tokens=2048,
temp=0.4,
top_p=0.9,
rp=1.1
)
Example with English
# Example: English text with the Bomu voice description, decoded greedily.
main(
description="bomu",
text="Hello! This is RobotsMali TTS model!",
max_tokens=2048,
temp=0.4,
top_p=0.9,
rp=1.1,
do_sample = False
)
License
Non-commercial use only, due to licensing restrictions on some of the data used in model training.
Model tree for RobotsMali/Wuro
Base model
maya-research/maya1