tiny-audio / diarization.py

Training in progress - step 1000

8e30b59 verified 18 days ago

26.8 kB

	"""Speaker diarization using TEN-VAD + ECAPA-TDNN + spectral clustering.

	Spectral clustering implementation adapted from FunASR/3D-Speaker:
	https://github.com/alibaba-damo-academy/FunASR
	MIT License (https://opensource.org/licenses/MIT)
	"""

	import warnings

	import numpy as np
	import scipy
	import sklearn.metrics.pairwise
	import torch
	from sklearn.cluster._kmeans import k_means
	from sklearn.preprocessing import normalize


	def _get_device() -> torch.device:
	    """Return the best available torch device for inference.

	    Preference order is CUDA, then Apple MPS, then CPU.

	    Returns:
	        ``torch.device("cuda")``, ``torch.device("mps")`` or
	        ``torch.device("cpu")``.
	    """
	    if torch.cuda.is_available():
	        return torch.device("cuda")

	    # ``torch.backends.mps`` is not present on every PyTorch build; look it
	    # up defensively so such installs fall back to CPU instead of raising
	    # AttributeError.
	    mps_backend = getattr(torch.backends, "mps", None)
	    if mps_backend is not None and mps_backend.is_available():
	        return torch.device("mps")

	    return torch.device("cpu")


	class SpectralCluster:
	"""Spectral clustering using unnormalized Laplacian of affinity matrix.

	Adapted from FunASR/3D-Speaker and SpeechBrain implementations.
	Uses eigenvalue gap to automatically determine number of speakers.
	"""

	def __init__(self, min_num_spks: int = 1, max_num_spks: int = 15, pval: float = 0.06):
	self.min_num_spks = min_num_spks
	self.max_num_spks = max_num_spks
	self.pval = pval

	def __call__(self, embeddings: np.ndarray, oracle_num: int \| None = None) -> np.ndarray:
	"""Run spectral clustering on embeddings.

	Args:
	embeddings: Speaker embeddings of shape [N, D]
	oracle_num: Optional known number of speakers

	Returns:
	Cluster labels of shape [N]
	"""
	# Similarity matrix computation
	sim_mat = self.get_sim_mat(embeddings)

	# Refining similarity matrix with pval
	prunned_sim_mat = self.p_pruning(sim_mat)

	# Symmetrization
	sym_prund_sim_mat = 0.5 * (prunned_sim_mat + prunned_sim_mat.T)

	# Laplacian calculation
	laplacian = self.get_laplacian(sym_prund_sim_mat)

	# Get Spectral Embeddings
	emb, num_of_spk = self.get_spec_embs(laplacian, oracle_num)

	# Perform clustering
	return self.cluster_embs(emb, num_of_spk)

	def get_sim_mat(self, embeddings: np.ndarray) -> np.ndarray:
	"""Compute cosine similarity matrix."""
	return sklearn.metrics.pairwise.cosine_similarity(embeddings, embeddings)

	def p_pruning(self, affinity: np.ndarray) -> np.ndarray:
	"""Prune low similarity values in affinity matrix (keep top pval fraction)."""
	n = affinity.shape[0]
	pval = max(self.pval, 6.0 / n)
	k_keep = max(1, int(pval * n))

	# Vectorized: find top-k indices per row and zero out the rest
	top_k_idx = np.argpartition(affinity, -k_keep, axis=1)[:, -k_keep:]
	mask = np.zeros_like(affinity, dtype=bool)
	np.put_along_axis(mask, top_k_idx, True, axis=1)
	affinity[~mask] = 0
	return affinity

	def get_laplacian(self, sim_mat: np.ndarray) -> np.ndarray:
	"""Compute unnormalized Laplacian matrix."""
	from scipy.sparse.csgraph import laplacian

	np.fill_diagonal(sim_mat, 0)
	return laplacian(sim_mat, normed=False)

	def get_spec_embs(
	self, laplacian: np.ndarray, k_oracle: int \| None = None
	) -> tuple[np.ndarray, int]:
	"""Extract spectral embeddings from Laplacian."""
	lambdas, eig_vecs = scipy.linalg.eigh(laplacian)

	if k_oracle is not None:
	num_of_spk = k_oracle
	else:
	lambda_gap_list = self.get_eigen_gaps(
	lambdas[self.min_num_spks - 1 : self.max_num_spks + 1]
	)
	num_of_spk = np.argmax(lambda_gap_list) + self.min_num_spks

	emb = eig_vecs[:, :num_of_spk]
	return emb, num_of_spk

	def cluster_embs(self, emb: np.ndarray, k: int) -> np.ndarray:
	"""Cluster spectral embeddings using k-means."""
	_, labels, _ = k_means(emb, k, n_init=10)
	return labels

	def get_eigen_gaps(self, eig_vals: np.ndarray) -> np.ndarray:
	"""Compute gaps between consecutive eigenvalues."""
	return np.diff(eig_vals)


	class SpeakerClusterer:
	"""Speaker clustering backend using spectral clustering with speaker merging.

	Features:
	- Spectral clustering with eigenvalue gap for auto speaker count detection
	- P-pruning for affinity matrix refinement
	- Post-clustering speaker merging by cosine similarity
	"""

	def __init__(
	self,
	min_num_spks: int = 2,
	max_num_spks: int = 10,
	merge_thr: float = 0.90, # Moderate merging
	):
	self.min_num_spks = min_num_spks
	self.max_num_spks = max_num_spks
	self.merge_thr = merge_thr
	self._spectral_cluster: SpectralCluster \| None = None

	def _get_spectral_cluster(self) -> SpectralCluster:
	"""Lazy-load spectral clusterer."""
	if self._spectral_cluster is None:
	self._spectral_cluster = SpectralCluster(
	min_num_spks=self.min_num_spks,
	max_num_spks=self.max_num_spks,
	)
	return self._spectral_cluster

	def __call__(self, embeddings: np.ndarray, num_speakers: int \| None = None) -> np.ndarray:
	"""Cluster speaker embeddings and return labels.

	Args:
	embeddings: Speaker embeddings of shape [N, D]
	num_speakers: Optional oracle number of speakers

	Returns:
	Cluster labels of shape [N]
	"""
	if len(embeddings.shape) != 2:
	raise ValueError(f"Expected 2D array, got shape {embeddings.shape}")

	# Handle edge cases
	if embeddings.shape[0] == 0:
	return np.array([], dtype=int)
	if embeddings.shape[0] == 1:
	return np.array([0], dtype=int)
	if embeddings.shape[0] < 6:
	return np.zeros(embeddings.shape[0], dtype=int)

	# Normalize embeddings and replace NaN/inf
	embeddings = np.nan_to_num(embeddings, nan=0.0, posinf=0.0, neginf=0.0)
	embeddings = normalize(embeddings)

	# Run spectral clustering (suppress numerical warnings)
	spectral = self._get_spectral_cluster()

	# Update min/max for oracle case
	if num_speakers is not None:
	spectral.min_num_spks = num_speakers
	spectral.max_num_spks = num_speakers

	with warnings.catch_warnings():
	warnings.filterwarnings("ignore", category=RuntimeWarning)
	labels = spectral(embeddings, oracle_num=num_speakers)

	# Reset min/max
	if num_speakers is not None:
	spectral.min_num_spks = self.min_num_spks
	spectral.max_num_spks = self.max_num_spks

	# Merge similar speakers if no oracle
	if num_speakers is None:
	labels = self._merge_by_cos(labels, embeddings, self.merge_thr)

	# Re-index labels sequentially
	_, labels = np.unique(labels, return_inverse=True)

	return labels

	def _merge_by_cos(self, labels: np.ndarray, embs: np.ndarray, cos_thr: float) -> np.ndarray:
	"""Merge similar speakers by cosine similarity of centroids."""
	from scipy.cluster.hierarchy import fcluster, linkage
	from scipy.spatial.distance import pdist

	unique_labels = np.unique(labels)
	if len(unique_labels) <= 1:
	return labels

	# Compute normalized speaker centroids
	centroids = np.array([embs[labels == lbl].mean(0) for lbl in unique_labels])
	centroids = normalize(centroids)

	# Hierarchical clustering with cosine distance
	distances = pdist(centroids, metric="cosine")
	linkage_matrix = linkage(distances, method="average")
	merged_labels = fcluster(linkage_matrix, t=1.0 - cos_thr, criterion="distance") - 1

	# Map original labels to merged labels
	label_map = dict(zip(unique_labels, merged_labels))
	return np.array([label_map[lbl] for lbl in labels])


	class LocalSpeakerDiarizer:
	"""Local speaker diarization using TEN-VAD + ECAPA-TDNN + spectral clustering.

	Pipeline:
	1. TEN-VAD detects speech segments
	2. Sliding window (1.0s, 75% overlap) for uniform embedding extraction
	3. ECAPA-TDNN extracts speaker embeddings per window
	4. Spectral clustering with eigenvalue gap for auto speaker detection
	5. Frame-level consensus voting for segment reconstruction
	6. Post-processing merges short segments to reduce flicker

	Tunable Parameters (class attributes):
	- WINDOW_SIZE: Embedding extraction window size in seconds
	- STEP_SIZE: Sliding window step size (overlap = WINDOW_SIZE - STEP_SIZE)
	- VAD_THRESHOLD: Speech detection threshold (lower = more sensitive)
	- VAD_MIN_DURATION: Minimum speech segment duration
	- VAD_MAX_GAP: Maximum gap to bridge between segments
	- VAD_PAD_ONSET/OFFSET: Padding added to speech segments
	- VOTING_RATE: Frame resolution for consensus voting
	- MIN_SEGMENT_DURATION: Minimum final segment duration
	- SAME_SPEAKER_GAP: Maximum gap to merge same-speaker segments
	- TAIL_COVERAGE_RATIO: Minimum tail coverage to add extra window
	"""

	_ten_vad_model = None
	_ecapa_model = None
	_device = None

	# ==================== TUNABLE PARAMETERS ====================

	# Sliding window for embedding extraction
	WINDOW_SIZE = 0.75 # seconds - shorter window for finer resolution
	STEP_SIZE = 0.15 # seconds (80% overlap for more votes)
	TAIL_COVERAGE_RATIO = 0.1 # Add extra window if tail > this ratio of window

	# VAD hysteresis parameters
	VAD_THRESHOLD = 0.25 # Balanced threshold
	VAD_MIN_DURATION = 0.05 # Minimum speech segment duration (seconds)
	VAD_MAX_GAP = 0.50 # Bridge gaps shorter than this (seconds)
	VAD_PAD_ONSET = 0.05 # Padding at segment start (seconds)
	VAD_PAD_OFFSET = 0.05 # Padding at segment end (seconds)

	# Frame-level voting
	VOTING_RATE = 0.01 # 10ms resolution for consensus voting

	# Post-processing
	MIN_SEGMENT_DURATION = 0.15 # Minimum final segment duration (seconds)
	SHORT_SEGMENT_GAP = 0.1 # Gap threshold for merging short segments
	SAME_SPEAKER_GAP = 0.5 # Gap threshold for merging same-speaker segments

	# ===========================================================

	@classmethod
	def _get_ten_vad_model(cls):
	"""Lazy-load TEN-VAD model (singleton)."""
	if cls._ten_vad_model is None:
	from ten_vad import TenVad

	cls._ten_vad_model = TenVad(hop_size=256, threshold=cls.VAD_THRESHOLD)
	return cls._ten_vad_model

	@classmethod
	def _get_device(cls) -> torch.device:
	"""Get the best available device."""
	if cls._device is None:
	cls._device = _get_device()
	return cls._device

	@classmethod
	def _get_ecapa_model(cls):
	"""Lazy-load ECAPA-TDNN speaker embedding model (singleton)."""
	if cls._ecapa_model is None:
	# Suppress torchaudio deprecation warning from SpeechBrain
	with warnings.catch_warnings():
	warnings.filterwarnings("ignore", message="torchaudio._backend")
	from speechbrain.inference.speaker import EncoderClassifier

	device = cls._get_device()
	cls._ecapa_model = EncoderClassifier.from_hparams(
	source="speechbrain/spkrec-ecapa-voxceleb",
	run_opts={"device": str(device)},
	)

	return cls._ecapa_model

	@classmethod
	def diarize(
	cls,
	audio: np.ndarray \| str,
	sample_rate: int = 16000,
	num_speakers: int \| None = None,
	min_speakers: int = 2,
	max_speakers: int = 10,
	**_kwargs,
	) -> list[dict]:
	"""Run speaker diarization on audio.

	Args:
	audio: Audio waveform as numpy array or path to audio file
	sample_rate: Audio sample rate (default 16000)
	num_speakers: Exact number of speakers (if known)
	min_speakers: Minimum number of speakers
	max_speakers: Maximum number of speakers

	Returns:
	List of dicts with 'speaker', 'start', 'end' keys
	"""
	# Handle file path input
	if isinstance(audio, str):
	import librosa

	audio, sample_rate = librosa.load(audio, sr=16000)

	# Ensure correct sample rate
	if sample_rate != 16000:
	import librosa

	audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
	sample_rate = 16000

	audio = audio.astype(np.float32)
	total_duration = len(audio) / sample_rate

	# Step 1: VAD (returns segments and raw frame-level decisions)
	segments, vad_frames = cls._get_speech_segments(audio, sample_rate)
	if not segments:
	return []

	# Step 2: Extract embeddings
	embeddings, window_segments = cls._extract_embeddings(audio, segments, sample_rate)
	if len(embeddings) == 0:
	return []

	# Step 3: Cluster
	clusterer = SpeakerClusterer(min_num_spks=min_speakers, max_num_spks=max_speakers)
	labels = clusterer(embeddings, num_speakers)

	# Step 4: Post-process with consensus voting (VAD-aware)
	return cls._postprocess_segments(window_segments, labels, total_duration, vad_frames)

	@classmethod
	def _get_speech_segments(
	cls, audio_array: np.ndarray, sample_rate: int = 16000
	) -> tuple[list[dict], list[bool]]:
	"""Get speech segments using TEN-VAD.

	Returns:
	Tuple of (segments list, vad_frames list of per-frame speech decisions)
	"""
	vad_model = cls._get_ten_vad_model()

	# Convert to int16 as required by TEN-VAD
	# Clip to prevent integer overflow
	if audio_array.dtype != np.int16:
	audio_int16 = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)
	else:
	audio_int16 = audio_array

	# Process frame by frame
	hop_size = 256
	frame_duration = hop_size / sample_rate
	speech_frames: list[bool] = []

	for i in range(0, len(audio_int16) - hop_size, hop_size):
	frame = audio_int16[i : i + hop_size]
	_, is_speech = vad_model.process(frame)
	speech_frames.append(is_speech)

	# Convert frame-level decisions to segments
	segments = []
	in_speech = False
	start_idx = 0

	for i, is_speech in enumerate(speech_frames):
	if is_speech and not in_speech:
	start_idx = i
	in_speech = True
	elif not is_speech and in_speech:
	start_time = start_idx * frame_duration
	end_time = i * frame_duration
	segments.append(
	{
	"start": start_time,
	"end": end_time,
	"start_sample": int(start_time * sample_rate),
	"end_sample": int(end_time * sample_rate),
	}
	)
	in_speech = False

	# Handle trailing speech
	if in_speech:
	start_time = start_idx * frame_duration
	end_time = len(speech_frames) * frame_duration
	segments.append(
	{
	"start": start_time,
	"end": end_time,
	"start_sample": int(start_time * sample_rate),
	"end_sample": int(end_time * sample_rate),
	}
	)

	return cls._apply_vad_hysteresis(segments, sample_rate), speech_frames

	@classmethod
	def _apply_vad_hysteresis(cls, segments: list[dict], sample_rate: int = 16000) -> list[dict]:
	"""Apply hysteresis-like post-processing to VAD segments."""
	if not segments:
	return segments

	segments = sorted(segments, key=lambda x: x["start"])

	# Fill short gaps
	merged = [segments[0].copy()]
	for seg in segments[1:]:
	gap = seg["start"] - merged[-1]["end"]
	if gap <= cls.VAD_MAX_GAP:
	merged[-1]["end"] = seg["end"]
	merged[-1]["end_sample"] = seg["end_sample"]
	else:
	merged.append(seg.copy())

	# Remove short segments
	filtered = [seg for seg in merged if (seg["end"] - seg["start"]) >= cls.VAD_MIN_DURATION]

	# Dilate segments (add padding)
	for seg in filtered:
	seg["start"] = max(0.0, seg["start"] - cls.VAD_PAD_ONSET)
	seg["end"] = seg["end"] + cls.VAD_PAD_OFFSET
	seg["start_sample"] = int(seg["start"] * sample_rate)
	seg["end_sample"] = int(seg["end"] * sample_rate)

	return filtered

	@classmethod
	def _extract_embeddings(
	cls, audio_array: np.ndarray, segments: list[dict], sample_rate: int
	) -> tuple[np.ndarray, list[dict]]:
	"""Extract speaker embeddings using sliding windows."""
	speaker_model = cls._get_ecapa_model()

	window_samples = int(cls.WINDOW_SIZE * sample_rate)
	step_samples = int(cls.STEP_SIZE * sample_rate)

	embeddings = []
	window_segments = []

	with torch.no_grad():
	for seg in segments:
	seg_start = seg["start_sample"]
	seg_end = seg["end_sample"]
	seg_len = seg_end - seg_start

	# Generate window positions
	if seg_len <= window_samples:
	starts = [seg_start]
	ends = [seg_end]
	else:
	starts = list(range(seg_start, seg_end - window_samples + 1, step_samples))
	ends = [s + window_samples for s in starts]

	# Cover tail if > TAIL_COVERAGE_RATIO of window remains
	if ends and ends[-1] < seg_end:
	remainder = seg_end - ends[-1]
	if remainder > (window_samples * cls.TAIL_COVERAGE_RATIO):
	starts.append(seg_end - window_samples)
	ends.append(seg_end)

	for c_start, c_end in zip(starts, ends):
	chunk = audio_array[c_start:c_end]

	# Pad short chunks with reflection
	if len(chunk) < window_samples:
	pad_width = window_samples - len(chunk)
	chunk = np.pad(chunk, (0, pad_width), mode="reflect")

	# Extract embedding using SpeechBrain's encode_batch
	chunk_tensor = torch.from_numpy(chunk).float().unsqueeze(0)
	embedding = (
	speaker_model.encode_batch(chunk_tensor).squeeze(0).squeeze(0).cpu().numpy()
	)

	# Validate embedding
	if np.isfinite(embedding).all() and np.linalg.norm(embedding) > 1e-8:
	embeddings.append(embedding)
	window_segments.append(
	{
	"start": c_start / sample_rate,
	"end": c_end / sample_rate,
	}
	)

	# Normalize all embeddings at once
	if embeddings:
	return normalize(np.array(embeddings)), window_segments
	return np.array([]), []

	@classmethod
	def _resample_vad(cls, vad_frames: list[bool], num_frames: int) -> np.ndarray:
	"""Resample VAD frame decisions to match voting grid resolution.

	VAD operates at 256 samples / 16000 Hz = 16ms per frame.
	Voting operates at VOTING_RATE (default 10ms) per frame.
	This maps VAD decisions to the finer voting grid.
	"""
	if not vad_frames:
	return np.zeros(num_frames, dtype=bool)

	vad_rate = 256 / 16000 # 16ms per VAD frame
	vad_arr = np.array(vad_frames)

	# Vectorized: compute VAD frame indices for each voting frame
	voting_times = np.arange(num_frames) * cls.VOTING_RATE
	vad_indices = np.clip((voting_times / vad_rate).astype(int), 0, len(vad_arr) - 1)
	return vad_arr[vad_indices]

	@classmethod
	def _postprocess_segments(
	cls,
	window_segments: list[dict],
	labels: np.ndarray,
	total_duration: float,
	vad_frames: list[bool],
	) -> list[dict]:
	"""Post-process using frame-level consensus voting with VAD-aware silence."""
	if not window_segments or len(labels) == 0:
	return []

	# Correct labels to be contiguous
	unique_labels = np.unique(labels)
	label_map = {old: new for new, old in enumerate(unique_labels)}
	clean_labels = np.array([label_map[lbl] for lbl in labels])
	num_speakers = len(unique_labels)

	if num_speakers == 0:
	return []

	# Create voting grid
	num_frames = int(np.ceil(total_duration / cls.VOTING_RATE)) + 1
	votes = np.zeros((num_frames, num_speakers), dtype=np.float32)

	# Accumulate votes
	for win, label in zip(window_segments, clean_labels):
	start_frame = int(win["start"] / cls.VOTING_RATE)
	end_frame = int(win["end"] / cls.VOTING_RATE)
	end_frame = min(end_frame, num_frames)
	if start_frame < end_frame:
	votes[start_frame:end_frame, label] += 1.0

	# Determine winner per frame
	frame_speakers = np.argmax(votes, axis=1)
	max_votes = np.max(votes, axis=1)

	# Resample VAD to voting grid resolution for silence-aware voting
	vad_resampled = cls._resample_vad(vad_frames, num_frames)

	# Convert frames to segments
	final_segments = []
	current_speaker = -1
	seg_start = 0.0

	for f in range(num_frames):
	speaker = int(frame_speakers[f])
	score = max_votes[f]

	# Force silence if VAD says no speech OR no votes
	if score == 0 or not vad_resampled[f]:
	speaker = -1

	if speaker != current_speaker:
	if current_speaker != -1:
	final_segments.append(
	{
	"speaker": f"SPEAKER_{current_speaker}",
	"start": seg_start,
	"end": f * cls.VOTING_RATE,
	}
	)
	current_speaker = speaker
	seg_start = f * cls.VOTING_RATE

	# Close last segment
	if current_speaker != -1:
	final_segments.append(
	{
	"speaker": f"SPEAKER_{current_speaker}",
	"start": seg_start,
	"end": num_frames * cls.VOTING_RATE,
	}
	)

	return cls._merge_short_segments(final_segments)

	@classmethod
	def _merge_short_segments(cls, segments: list[dict]) -> list[dict]:
	"""Merge short segments to reduce flicker."""
	if not segments:
	return []

	clean: list[dict] = []
	for seg in segments:
	dur = seg["end"] - seg["start"]
	if dur < cls.MIN_SEGMENT_DURATION:
	if (
	clean
	and clean[-1]["speaker"] == seg["speaker"]
	and seg["start"] - clean[-1]["end"] < cls.SHORT_SEGMENT_GAP
	):
	clean[-1]["end"] = seg["end"]
	continue

	if (
	clean
	and clean[-1]["speaker"] == seg["speaker"]
	and seg["start"] - clean[-1]["end"] < cls.SAME_SPEAKER_GAP
	):
	clean[-1]["end"] = seg["end"]
	else:
	clean.append(seg)

	return clean

	@classmethod
	def assign_speakers_to_words(
	cls,
	words: list[dict],
	speaker_segments: list[dict],
	) -> list[dict]:
	"""Assign speaker labels to words based on timestamp overlap.

	Args:
	words: List of word dicts with 'word', 'start', 'end' keys
	speaker_segments: List of speaker dicts with 'speaker', 'start', 'end' keys

	Returns:
	Words list with 'speaker' key added to each word
	"""
	for word in words:
	word_mid = (word["start"] + word["end"]) / 2

	# Find the speaker segment that contains this word's midpoint
	best_speaker = None
	for seg in speaker_segments:
	if seg["start"] <= word_mid <= seg["end"]:
	best_speaker = seg["speaker"]
	break

	# If no exact match, find closest segment
	if best_speaker is None and speaker_segments:
	min_dist = float("inf")
	for seg in speaker_segments:
	seg_mid = (seg["start"] + seg["end"]) / 2
	dist = abs(word_mid - seg_mid)
	if dist < min_dist:
	min_dist = dist
	best_speaker = seg["speaker"]

	word["speaker"] = best_speaker

	return words


	class SpeakerDiarizer:
	    """Speaker diarization using TEN-VAD + ECAPA-TDNN + spectral clustering.

	    Thin public entry point; every call is forwarded to
	    ``LocalSpeakerDiarizer``.

	    Example:
	        >>> segments = SpeakerDiarizer.diarize(audio_array)
	        >>> for seg in segments:
	        ...     print(f"{seg['speaker']}: {seg['start']:.2f} - {seg['end']:.2f}")
	    """

	    @classmethod
	    def diarize(
	        cls,
	        audio: np.ndarray | str,
	        sample_rate: int = 16000,
	        num_speakers: int | None = None,
	        min_speakers: int | None = None,
	        max_speakers: int | None = None,
	        **_kwargs,
	    ) -> list[dict]:
	        """Run speaker diarization on audio.

	        Args:
	            audio: Audio waveform as numpy array or path to audio file
	            sample_rate: Audio sample rate (default 16000)
	            num_speakers: Exact number of speakers (if known)
	            min_speakers: Minimum number of speakers; a falsy value
	                (``None`` or 0) selects 2
	            max_speakers: Maximum number of speakers; a falsy value
	                (``None`` or 0) selects 10
	            **_kwargs: Accepted and ignored.

	        Returns:
	            List of dicts with 'speaker', 'start', 'end' keys
	        """
	        # Resolve the speaker-count bounds before delegating.
	        lower_bound = min_speakers if min_speakers else 2
	        upper_bound = max_speakers if max_speakers else 10

	        forwarded = {
	            "sample_rate": sample_rate,
	            "num_speakers": num_speakers,
	            "min_speakers": lower_bound,
	            "max_speakers": upper_bound,
	        }
	        return LocalSpeakerDiarizer.diarize(audio, **forwarded)

	    @classmethod
	    def assign_speakers_to_words(
	        cls,
	        words: list[dict],
	        speaker_segments: list[dict],
	    ) -> list[dict]:
	        """Assign speaker labels to words based on timestamp overlap.

	        Delegates to ``LocalSpeakerDiarizer.assign_speakers_to_words`` and
	        returns its result unchanged.
	        """
	        labelled_words = LocalSpeakerDiarizer.assign_speakers_to_words(words, speaker_segments)
	        return labelled_words