Text Classification
Transformers
Safetensors
Bengali
electra
bangla
bangla-classifier
binary-classifier
text-classifier
Instructions to use SayedShaun/bangla-classifier-binary with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use SayedShaun/bangla-classifier-binary with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="SayedShaun/bangla-classifier-binary")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("SayedShaun/bangla-classifier-binary")
model = AutoModelForSequenceClassification.from_pretrained("SayedShaun/bangla-classifier-binary")
```
- Notebooks
- Google Colab
- Kaggle
| from datasets import load_dataset, Dataset | |
| import random | |
| import numpy as np | |
| from transformers import ( | |
| AutoTokenizer, | |
| DataCollatorWithPadding, | |
| AutoModelForSequenceClassification, | |
| TrainingArguments, | |
| Trainer, | |
| PreTrainedTokenizer, | |
| ElectraForSequenceClassification, | |
| EarlyStoppingCallback | |
| ) | |
| from dataclasses import dataclass | |
| from sklearn.metrics import accuracy_score, precision_recall_fscore_support | |
def process(batch: dict, tokenizer: PreTrainedTokenizer) -> dict:
    """Binarise the polarity labels of a batch and tokenize its text.

    Label mapping applied to ``batch["Polarity"]``:
      * "SP" / "WP" -> 1 (positive)
      * "WN" / "SN" -> 0 (negative)
      * "NU"        -> 1 or 0, drawn with ``random.choice``
      * anything else is kept unchanged

    Args:
        batch: Mapping with a "Text" column and a "Polarity" column.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``
            whose result exposes "input_ids" and "attention_mask".

    Returns:
        The same ``batch`` object, extended in place with "input_ids",
        "attention_mask" and "labels".
    """

    def to_binary(polarity):
        # Guard clauses, checked in the order positive, negative, neutral.
        if polarity in ("SP", "WP"):
            return 1
        if polarity in ("WN", "SN"):
            return 0
        if polarity == "NU":
            # Neutral samples get a random side.
            return random.choice([1, 0])
        return polarity

    binary_labels = [to_binary(polarity) for polarity in batch["Polarity"]]

    encoded = tokenizer(batch["Text"], truncation=True)
    for key in ("input_ids", "attention_mask"):
        batch[key] = encoded[key]
    batch["labels"] = binary_labels
    return batch
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            arg-max of ``logits`` over the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use ``average='binary'``.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(-1)

    metrics = {"accuracy": accuracy_score(gold, predicted)}
    prf = precision_recall_fscore_support(gold, predicted, average='binary')
    # The fourth element (support) is not reported.
    metrics["precision"] = prf[0]
    metrics["recall"] = prf[1]
    metrics["f1"] = prf[2]
    return metrics
def pipeline(args):
    """Fine-tune a binary sequence classifier and publish it to the Hub.

    Steps: load model and tokenizer from ``args.model_name``, load
    ``args.dataset_name``, preprocess it with ``process``, split its "train"
    split into train/test by ``args.split_ratio``, train with early stopping
    on F1, evaluate, predict on the test split, then push the model and the
    tokenizer to ``args.hub_location``.

    Args:
        args: Object exposing ``model_name``, ``dataset_name``,
            ``split_ratio``, ``batch_size``, ``epochs``, ``learning_rate``,
            ``fp16`` and ``hub_location``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    dataset = load_dataset(args.dataset_name)
    dataset = dataset.map(process, batched=True, fn_kwargs={'tokenizer': tokenizer})
    dataset = dataset["train"].train_test_split(args.split_ratio)
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        weight_decay=0.01,
        eval_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        report_to="none",
        save_steps=500,
        eval_steps=500,
        save_total_limit=1,
        logging_steps=500,
        fp16=args.fp16,
        greater_is_better=True,
        metric_for_best_model="f1",
        # Target repository for trainer.push_to_hub(); without this the
        # repo name would be derived from output_dir ("results").
        hub_model_id=args.hub_location,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )

    trainer.train()
    trainer.evaluate()
    trainer.predict(test_dataset)

    # Push to Hub.
    # Trainer.push_to_hub's first positional parameter is the commit message,
    # not the repo id, so the destination is set via hub_model_id above.
    trainer.push_to_hub()
    tokenizer.push_to_hub(args.hub_location)
@dataclass
class Arguments:
    """Hyperparameters and resource names for the training pipeline.

    Declared as a dataclass so individual defaults can be overridden at
    construction time, e.g. ``Arguments(epochs=3, fp16=False)``, while
    ``Arguments()`` keeps every default below.
    """

    # Checkpoint the model and tokenizer are loaded from.
    model_name: str = "csebuetnlp/banglabert"
    # Dataset identifier passed to load_dataset.
    dataset_name: str = "SayedShaun/sentigold"
    # Value passed to train_test_split for the held-out test split.
    split_ratio: float = 0.1
    # Per-device batch size, used for both training and evaluation.
    batch_size: int = 128
    # Maximum number of training epochs.
    epochs: int = 40
    learning_rate: float = 1e-5
    # Forwarded to TrainingArguments(fp16=...).
    fp16: bool = True
    # Hub repository the trained model and tokenizer are pushed to.
    hub_location: str = "SayedShaun/bangla-classifier-binary"
if __name__ == "__main__":
    # Script entry point: run the training pipeline with default arguments.
    pipeline(Arguments())