Amazon Reviews Sentiment Classification with DistilBERT (5-Star Ratings)¶

Author: Mateusz Kowalski¶

Date: 08/2025¶

Task overview¶

task: 5-class sentiment (star rating) classification on English Amazon reviews

splits: predefined train (200k), validation (5k), test (5k)

labels: integers 0..4. In this dataset, label=0 corresponds to the most negative reviews (the previewed examples later in the notebook are clearly 1-star-like). I'll treat stars = label + 1, keep labels as 0..4 for the model, and only map to stars when displaying results.

class balance: perfectly balanced (40k per class in train). This means I don't need class weights or a weighted sampler; accuracy will be meaningful, and macro-F1 is still good practice.

Data quality & preprocessing¶

missing text: a "missing" review can show up as None/NaN or as an empty/whitespace-only string. I will drop rows where text is None/NaN, strip whitespace, and drop rows where len(text) == 0.

duplicates: for a first pass on CPU, I will skip deduplication to save time; on a balanced, large dataset it rarely changes the baseline (a sketch of how it could be added appears after the cleaning cell below).

non-English or noisy lines: this split is _en, so I will assume English and skip language filtering.

text length: I will truncate to a fixed max_length during tokenization (e.g., 128 or 256 tokens) to keep memory and runtime in check on CPU.

normalization: transformers don’t need heavy manual cleaning; I will rely on the tokenizer and avoid aggressive lowercasing/punctuation removal.

labels sanity check: before training, I will print a few samples per class to confirm the mapping (stars = label + 1); a minimal sketch of this check follows this list. If anything looks off, I will adjust.
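As a small illustration of that check (a sketch only; it assumes the dataset has already been loaded with load_dataset as in the cells further down):

# hypothetical sanity check: show one example per label to confirm stars = label + 1
df_check = pd.DataFrame(dataset["train"])
for _, row in df_check.groupby("label").head(1).iterrows():
    print(f"label {row['label']} (stars = {row['label'] + 1}): {row['text'][:120]}...")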

Plan (CPU-friendly)¶

tokenization: use distilbert-base-uncased tokenizer with max_length=128, truncation=True, padding="max_length".

subsample for prototyping: keep the train subset small (≈10–20k) to iterate quickly; use the predefined validation split for tuning.

dataloaders: use a small batch_size (≈8 on CPU) and shuffle=True for training.

model: start with distilbert-base-uncased plus a 5-class classification head.

training: run 1–2 epochs to validate the pipeline; consider gradient clipping; no mixed precision on CPU.

metrics: report accuracy and macro-F1 on validation and test.

save: persist the tokenizer and model, and keep a label mapping {0: "1★", …, 4: "5★"} for clean outputs (a sketch of attaching this mapping to the model config follows this list).
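One way to keep that star mapping attached to the model itself (a sketch, not what the notebook below actually does) is to pass id2label/label2id when loading the classification head, so they end up in the saved config:

# hypothetical sketch: store the star mapping in the model config so it travels with the saved model
id2label = {i: f"{i + 1}★" for i in range(5)}   # 0 -> "1★", ..., 4 -> "5★"
label2id = {v: k for k, v in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5,
    id2label=id2label,
    label2id=label2id,
)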

Why simplified?¶

CPU training is much slower for transformers, so I will limit sequence length, reduce batch size, and train fewer epochs. With a clean, balanced dataset and a strong pretrained backbone, this should still yield a solid baseline. This limitation comes from the integrated graphics card in my student laptop, which does not support CUDA-based GPU acceleration.

Library imports¶

In [149]:
import random

import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
import torch
from torch.nn.functional import cross_entropy
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_scheduler
)

Seed & device set-up¶

In [100]:
# a seed for reproducibility
my_seed = 123456789
random.seed(my_seed) # ordinary python
np.random.seed(my_seed) # numpy
torch.manual_seed(my_seed) # torch

# select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}") # 

# this is done because:
# - GPU: better for many calculations at the same time (games, AI models - first choice)
# - CPU: better for single operations, that's why it's secondary choice
device: cpu

Very important: running on CPU means computations will be slower, especially during training, so I'll keep the training subset small, cap the sequence length, and use a small batch size to make the project feasible.¶

In [103]:
# load amazon reviews dataset (english, 5 classes not 2)
dataset = load_dataset("SetFit/amazon_reviews_multi_en")
In [104]:
# this dataset comes with predefined train/validation/test splits
# the load_dataset() function detects these files and automatically loads them into a DatasetDict

print(dataset)
print(dataset["train"].features)
DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 5000
    })
})
{'id': Value('string'), 'text': Value('string'), 'label': Value('int64'), 'label_text': Value('string')}

Quick data preview¶

In [106]:
# preview a few examples
for i in range(3):
    example = dataset["train"][i]
    print(f"id: {example['id']}")
    print(f"label: {example['label']} ({example['label_text']})")
    print(f"text: {example['text'][:300]}...")
    print("-" * 40)
id: en_0964290
label: 0 (0)
text: Arrived broken. Manufacturer defect. Two of the legs of the base were not completely formed, so there was no way to insert the casters. I unpackaged the entire chair and hardware before noticing this. So, I'll spend twice the amount of time boxing up the whole useless thing and send it back with a 1...
----------------------------------------
id: en_0690095
label: 0 (0)
text: the cabinet dot were all detached from backing... got me...
----------------------------------------
id: en_0311558
label: 0 (0)
text: I received my first order of this product and it was broke so I ordered it again. The second one was broke in more places than the first. I can't blame the shipping process as it's shrink wrapped and boxed....
----------------------------------------
In [107]:
# class distribution in train set
df_train = pd.DataFrame(dataset["train"])
print("class distribution (train):")
print(df_train["label_text"].value_counts())
class distribution (train):
label_text
0    40000
1    40000
2    40000
3    40000
4    40000
Name: count, dtype: int64

NaNs¶

In [109]:
# convert training split to pandas dataframe for cleaning
train_df = pd.DataFrame(dataset["train"])

# drop rows with NaN in 'text'
train_df = train_df.dropna(subset=["text"])

# drop rows with empty text after stripping whitespace
train_df = train_df[train_df["text"].str.strip().str.len() > 0]

print(f"remaining records after cleaning: {len(train_df)}")

# convert back to Hugging Face Dataset
dataset["train"] = Dataset.from_pandas(train_df, preserve_index=False)
remaining records after cleaning: 200000
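Deduplication was skipped (see the note in the preprocessing plan); if it were added, a minimal exact-match version on the pandas frame above could look like this:

# hypothetical sketch: exact-duplicate removal on the 'text' column (not applied in this run)
n_before = len(train_df)
train_df_dedup = train_df.drop_duplicates(subset=["text"])
print(f"exact duplicates that would be dropped: {n_before - len(train_df_dedup)}")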
Now I need to check how many tokens reviews contain on average so I can pick a sensible truncation length later; truncation keeps computation on CPU manageable.¶
In [114]:
# ChatGPT helped me work around the 'label' column being a plain Value instead of a ClassLabel,
# so I stratify with sklearn's train_test_split rather than the datasets library's own splitting

# convert train split to pandas
df_train = pd.DataFrame(dataset["train"])

# stratified sample down to 20k rows
df_small, _ = train_test_split(
    df_train,
    train_size=20000,
    stratify=df_train["label"],
    random_state=my_seed
)

# convert back to HF Dataset (Dataset is already imported at the top)
dataset["train"] = Dataset.from_pandas(df_small, preserve_index=False)

print("train size reduced to:", dataset["train"].num_rows)
print(df_small["label"].value_counts())
train size reduced to: 20000
label
3    4000
2    4000
1    4000
4    4000
0    4000
Name: count, dtype: int64

Tokenizer¶

In [118]:
# choose tokenizer for length analysis
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") # DistilBERT tokenizer; lowercase text
# I want it simplified, but I'm aware that uppercase "TERRIBLE" may emphasise disappointment,
# while lowercasing makes me lose that additional piece of information

# take a sample of texts to estimate length distribution
sample_texts = dataset["train"]["text"][:5000]
lengths = [len(tokenizer.encode(t, truncation=False)) for t in sample_texts] # no truncation here because I want the exact token count

# basic stats
print(f"mean length: {np.mean(lengths):.1f} tokens")
print(f"95th percentile: {np.percentile(lengths, 95)} tokens")
print(f"max length: {np.max(lengths)} tokens")
Token indices sequence length is longer than the specified maximum sequence length for this model (683 > 512). Running this sequence through the model will result in indexing errors
mean length: 44.1 tokens
95th percentile: 120.0 tokens
max length: 683 tokens
In [119]:
# when I checked this on the whole dataset the results were similar: the 95th percentile was ~110 tokens and the mean length slightly higher

# just to check how tokenization works:

print(tokenizer.encode("I love cats", truncation=False)) # 101 ([CLS]) and 102 ([SEP]) are BERT's special tokens
[101, 1045, 2293, 8870, 102]
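To make the special-token point concrete, the ids can be mapped back to token strings (a quick sketch):

# map the ids back to tokens: 101 is [CLS], 102 is [SEP]
ids = tokenizer.encode("I love cats", truncation=False)
print(tokenizer.convert_ids_to_tokens(ids))  # ['[CLS]', 'i', 'love', 'cats', '[SEP]']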

Tokenization¶

In [123]:
# configuration of my tokenizer (reusing the tokenizer loaded above)

max_length = 128 # (!) from length analysis above
# sequences longer than 128 tokens are truncated; shorter ones are padded up to max_length

# function to tokenize batches of examples
def tokenize_batch(batch):
    # run the tokenizer on the 'text' column
    encoded = tokenizer(
        batch["text"], # list of all "texts"
        max_length=max_length,
        truncation=True, # cut longer sequences
        padding="max_length" # pad shorter sequences to max_length
    )
    return encoded # encoded is a BatchEncoding object, which behaves like a dictionary

# it's going to look like this:
# {
#  "input_ids": [[101, 2026, 2171, ...], [101, 1045, 2293, ...], ...],
#  "attention_mask": [[1, 1, 1, ...], [1, 1, 1, ...], ...]
# }
# so each token gets value 1 if it is a real token and 0 if it is padding

# all I need is the encoded text with attention masks, plus labels, to compute the loss

# apply tokenization to each split
dataset_tokenized = dataset.map(
    tokenize_batch,
    batched=True, # process in batches for speed
    remove_columns=["text", "label_text", "id"] # drop raw text, the text label and the id; only input_ids, attention_mask and label remain
)

print(dataset_tokenized)

# it's an HF DatasetDict object, which means:
# - KEY: train/validation/test
# - VALUE: an HF Dataset with features ['label', 'input_ids', 'attention_mask'] and its own num_rows
DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})
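As a quick sanity check on the tokenized data (a sketch; run before set_format switches the columns to torch tensors), one example can be decoded back:

# decode one tokenized training example and count its non-padding tokens
ex = dataset_tokenized["train"][0]
print(tokenizer.decode(ex["input_ids"], skip_special_tokens=True)[:200])
print("non-padding tokens:", sum(ex["attention_mask"]))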

Dataloaders¶

In [128]:
# setting torch format for hf datasets
dataset_tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
) # it must be a torch.Tensor not a list!

# small cpu-friendly batch size
batch_size = 8
In [130]:
# "windows tip: keep num_workers=0"

train_loader = DataLoader(
    dataset_tokenized["train"], 
    batch_size=batch_size,
    shuffle=True,
    num_workers=0
)

val_loader = DataLoader(
    dataset_tokenized["validation"],
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)

test_loader = DataLoader(
    dataset_tokenized["test"],
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)

print("ready: dataloaders ->",
      "train:", len(train_loader),
      "val:", len(val_loader),
      "test:", len(test_loader))
ready: dataloaders -> train: 2500 val: 625 test: 625
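A quick shape check on a single batch (a sketch) confirms the loaders yield torch tensors of the expected size:

# each batch should contain [batch_size, max_length] tensors plus a [batch_size] label vector
sample_batch = next(iter(train_loader))
print({k: tuple(v.shape) for k, v in sample_batch.items()})
# expected: input_ids/attention_mask -> (8, 128), label -> (8,)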

Loading DistilBERT model¶

In [135]:
# reuse the device selected at the beginning (cpu here)

# number of output classes
num_labels = 5

# loading pretrained DistilBERT with a classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [137]:
# moving model to the chosen device
model.to(device)

# confirming that the model is loaded and on a correct device
print(model.device)
cpu
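For a rough sense of how heavy this model is on CPU, the parameters can be counted (a sketch):

# total vs. trainable parameter counts for DistilBERT plus the 5-class head
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total: {total_params:,} | trainable: {trainable_params:,}")  # roughly 67M for this setup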

Training¶

In [142]:
# learning rate 
learning_rate = 5e-5 # transformers need smaller learning rates than those typically used for simpler models
weight_decay = 0.01 # a commonly used default for transformer fine-tuning

# AdamW (Adam with decoupled weight decay) applies weight decay separately from the gradient update
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay
)

# total number of optimizer steps over the whole training run
num_epochs = 2 # kept small (cpu)
num_training_steps = num_epochs * len(train_loader) # len(train_loader) = 20000 / 8 = 2500, so 2 * 2500 = 5000 steps
# i.e. with 2 epochs the model passes over the whole training subset twice, batch by batch

# the scheduler decays the learning rate linearly from its initial value down to 0 over training
# the learning rate controls the size of each weight-update step
scheduler = get_scheduler(
    name="linear", # linear decay
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

print(f"training steps: {num_training_steps}")
training steps: 5000
In [146]:
# training loop by ChatGPT
for epoch in range(num_epochs):
    print(f"epoch {epoch+1}/{num_epochs}")

    # --- training phase ---
    model.train()  # set model to training mode
    train_losses = []

    for batch in train_loader:
        # move batch to device (cpu or gpu)
        batch = {k: v.to(device) for k, v in batch.items()}

        # forward pass -> logits
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"]
        )

        loss = outputs.loss  # loss for this batch
        train_losses.append(loss.item())

        # backward pass
        loss.backward()

        # update model weights
        optimizer.step()
        scheduler.step()  # adjust learning rate
        optimizer.zero_grad()  # clear gradients

    avg_train_loss = sum(train_losses) / len(train_losses)
    print(f"train loss: {avg_train_loss:.4f}")

    # --- validation phase ---
    model.eval()  # set model to evaluation mode
    val_labels = []
    val_preds = []
    val_losses = []

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["label"]
            )
            loss = outputs.loss
            val_losses.append(loss.item())

            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            val_labels.extend(batch["label"].cpu().numpy())
            val_preds.extend(preds.cpu().numpy())

    avg_val_loss = sum(val_losses) / len(val_losses)
    val_acc = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average="macro")

    print(f"val loss: {avg_val_loss:.4f} | val acc: {val_acc:.4f} | val macro-F1: {val_f1:.4f}")
    print("-" * 50)
epoch 1/2
train loss: 1.1056
val loss: 1.0578 | val acc: 0.5428 | val macro-F1: 0.5325
--------------------------------------------------
epoch 2/2
train loss: 0.8420
val loss: 1.0537 | val acc: 0.5534 | val macro-F1: 0.5524
--------------------------------------------------
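The plan mentioned gradient clipping, which the loop above does not apply. If it were added, the backward/update lines inside the training loop would become something like this (an excerpt, not a standalone cell; max_norm=1.0 is a commonly used default):

# hypothetical excerpt of the inner training step with gradient clipping (not used in the run above)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # cap the global gradient norm
optimizer.step()
scheduler.step()
optimizer.zero_grad()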

Test evaluation¶

In [151]:
# evaluation on test set
model.eval()
test_labels = []
test_preds = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        test_labels.extend(batch["label"].cpu().numpy())
        test_preds.extend(preds.cpu().numpy())

# accuracy + macro-F1
test_acc = accuracy_score(test_labels, test_preds)
test_f1 = f1_score(test_labels, test_preds, average="macro")

print(f"test acc: {test_acc:.4f} | test macro-F1: {test_f1:.4f}")

# detailed report
print(classification_report(test_labels, test_preds, target_names=[f"{i+1}★" for i in range(5)]))
test acc: 0.5660 | test macro-F1: 0.5663
              precision    recall  f1-score   support

          1★       0.67      0.66      0.67      1000
          2★       0.46      0.47      0.46      1000
          3★       0.43      0.46      0.45      1000
          4★       0.54      0.50      0.52      1000
          5★       0.73      0.74      0.73      1000

    accuracy                           0.57      5000
   macro avg       0.57      0.57      0.57      5000
weighted avg       0.57      0.57      0.57      5000

In [159]:
save_dir = "E:\\NLP_PROJECT"  # double backslashes for a Windows path

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Model i tokenizer zapisane w: {save_dir}")
Model i tokenizer zapisane w: E:\NLP_PROJECT
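To verify the saved artifacts load back cleanly, a quick round-trip check (a sketch using the same save_dir):

# reload the fine-tuned model and tokenizer from disk as a sanity check
reloaded_tokenizer = AutoTokenizer.from_pretrained(save_dir)
reloaded_model = AutoModelForSequenceClassification.from_pretrained(save_dir)
print(reloaded_model.config.num_labels)  # should print 5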

New text function¶

In [174]:
def predict_review(text):
    model.eval()
    with torch.no_grad():
        # tokenize the new text
        encoded = tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # move to device
        encoded = {k: v.to(device) for k, v in encoded.items()}

        # forward pass
        outputs = model(**encoded)
        logits = outputs.logits
        pred_label = torch.argmax(logits, dim=-1).item()

    # map label to stars
    return f"Predicted rating: {pred_label + 1}★"

# example usage
print(predict_review("The chair arrived broken and missing screws. Terrible quality!")) # 1★
print(predict_review("Very good! Fast delivery and great quality but I had to wait for it longer than expected.")) # that's more difficult, 4★
Predicted rating: 1★
Predicted rating: 4★
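If per-class confidences are also of interest, a softmax over the logits gives a probability per star; a hypothetical variant of predict_review (not part of the run above):

# hypothetical variant returning the full probability distribution over stars
def predict_review_proba(text):
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(text, max_length=max_length, truncation=True,
                            padding="max_length", return_tensors="pt")
        encoded = {k: v.to(device) for k, v in encoded.items()}
        probs = torch.softmax(model(**encoded).logits, dim=-1).squeeze(0)
    return {f"{i + 1}★": round(p.item(), 3) for i, p in enumerate(probs)}

print(predict_review_proba("Decent product, but shipping took forever."))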

Conclusions¶

Interpretation of 57% accuracy (exact match) for 5-class star ratings¶

Given a balanced 5-class problem, random guessing would yield about 20% accuracy. Achieving 57% exact-match accuracy means the model is correct nearly 2.9 times as often as chance, even under the constraints of:

  • Label noise: e.g., 1-star given out of frustration when the text suggests 2–3 stars

  • Ordinal nature of labels: misclassifying 4★ as 5★ is penalized the same as 1★ as 5★

  • Lightweight training on CPU without extensive hyperparameter tuning or advanced architectures.

Btw: as a rough intuition, 57% exact-match accuracy on this 5-class task (20% chance level) reflects a level of skill that would correspond to roughly 70% or more accuracy on a binary task (50% chance level).

Key takeaways:¶
  • Good relative performance: 57% exact-match on 5 classes is a solid improvement over baseline, especially with noisy, subjective targets.

  • Accuracy underestimates quality: most “errors” are likely off by only one star, which is much less severe in practice (see the quick check after this list).

  • Potential for further improvement: an ordinal-aware loss could better exploit the rating structure, while dynamic padding and a smaller max sequence length could speed up CPU training with little or no quality loss.

  • Practical utility: Even without GPU, the model can be a useful starting point for automated review rating prediction or as a feature in downstream sentiment analysis.
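A quick way to check the "off by one star" intuition, reusing test_labels and test_preds from the evaluation cell above (a sketch):

# share of test predictions within one star of the truth, plus mean absolute error in stars
errors = np.abs(np.array(test_preds) - np.array(test_labels))
print(f"within-one-star accuracy: {np.mean(errors <= 1):.4f} | MAE: {errors.mean():.2f} stars")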