Training, Tuning and Deploying a PyTorch Text Classification Model
import os
import random
import datasets
import numpy as np
import pandas as pd
import torch
import transformers
from IPython.display import HTML, display
from datasets import ClassLabel, Sequence, load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          EvalPrediction, Trainer, TrainingArguments,
                          default_data_collator)
print(f"Notebook runtime: {'GPU' if torch.cuda.is_available() else 'CPU'}")
print(f"PyTorch version : {torch.__version__}")
print(f"Transformers version : {datasets.__version__}")
print(f"Datasets version : {transformers.__version__}")
APP_NAME = "finetuned-bert-classifier"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
dataset = load_dataset("imdb")
print(
    "Total # of rows in training dataset {} and size {:5.2f} MB".format(
        dataset["train"].shape[0], dataset["train"].size_in_bytes / (1024 * 1024)
    )
)
print(
    "Total # of rows in test dataset {} and size {:5.2f} MB".format(
        dataset["test"].shape[0], dataset["test"].size_in_bytes / (1024 * 1024)
    )
)
label_list = dataset["train"].unique("label")
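# Optional sanity check (not part of the original notebook): the IMDB "label" column is a
# ClassLabel feature, so the human-readable class names can be inspected directly.
print(label_list)  # e.g. [0, 1] (order may vary)
print(dataset["train"].features["label"].names)  # ['neg', 'pos']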
def show_random_elements(dataset, num_examples=2):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset) - 1)
        while pick in picks:
            pick = random.randint(0, len(dataset) - 1)
        picks.append(pick)
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(
                lambda x: [typ.feature.names[i] for i in x]
            )
    display(HTML(df.to_html()))
show_random_elements(dataset["train"])
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print("~~~~~~~~~~~~ ~~~~~~~~~~~~~~~")
print("~~~~~~~~~~~~ ~~~~~~~~~~~~~~~")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
batch_size = 16
max_seq_length = 128
model_name_or_path = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
tokenizer("Hello, this is one sentence!")
example = dataset["train"][4]
print(example)
tokenizer(
    ["Hello", ",", "this", "is", "one", "sentence", "split", "into", "words", "."],
    is_split_into_words=True,
)
# Dataset loading repeated here to make this cell idempotent,
# since we are overwriting the `dataset` variable below.
dataset = load_dataset("imdb")
print("~~~~~8~~~~~~~~")
# Mapping labels to ids.
# NOTE: We could extract this mapping automatically, but the dataset's `unique` method
# does not report the label -1 that shows up during preprocessing (it comes from the
# "unsupervised" split), hence the additional -1 entry in the dictionary.
label_to_id = {1: 1, 0: 0, -1: 0}
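# A quick, hedged illustration of where the -1 labels come from: load_dataset("imdb")
# also returns an "unsupervised" split whose examples carry the placeholder label -1.
print(dataset.keys())  # dict_keys(['train', 'test', 'unsupervised'])
print(dataset["unsupervised"].unique("label"))  # [-1]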
def preprocess_function(examples):
    """
    Tokenize the input example texts.
    NOTE: The same preprocessing step(s) will be applied
    at the time of inference as well.
    """
    args = (examples["text"],)
    result = tokenizer(
        *args, padding="max_length", max_length=max_seq_length, truncation=True
    )
    # Map labels to IDs (not necessary for GLUE tasks)
    if label_to_id is not None and "label" in examples:
        result["label"] = [label_to_id[example] for example in examples["label"]]
    return result
print("~~~~~9~~~~~~~~")
# apply preprocessing function to input examples
dataset = dataset.map(preprocess_function, batched=True, load_from_cache_file=True)
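# Optional inspection (not in the original notebook): after mapping, each example should
# carry the tokenizer outputs alongside the raw text, padded/truncated to max_seq_length.
print(dataset["train"].column_names)
# e.g. ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']
print(len(dataset["train"][0]["input_ids"]))  # 128, i.e. max_seq_length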
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path, num_labels=len(label_list)
)
args = TrainingArguments(
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    output_dir="/tmp/cls",
)
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
print("~~~~~10~~~~~~~~")
trainer = Trainer(
    model,
    args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=default_data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
print("~~~~11~~~~~~~~")
trainer.train()
print("~~~~~12~~~~~~~~")
saved_model_local_path = "./models"
trainer.save_model(saved_model_local_path)
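# The saved directory can be inspected; because a tokenizer was passed to the Trainer,
# trainer.save_model() typically writes the tokenizer files next to the model weights
# (exact file names depend on the transformers version).
print(sorted(os.listdir(saved_model_local_path)))
# e.g. ['config.json', 'pytorch_model.bin', 'special_tokens_map.json',
#       'tokenizer.json', 'tokenizer_config.json', 'training_args.bin', 'vocab.txt']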
print("~~~~~13~~~~~~~~")
history = trainer.evaluate()
print("~~~~~14~~~~~~~~")
history
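# A minimal local-inference sketch using the saved artifacts (an illustration only;
# the original notebook goes on to deploy the model on Vertex AI instead).
from transformers import pipeline

classifier = pipeline(
    "sentiment-analysis",
    model=saved_model_local_path,
    tokenizer=saved_model_local_path,
)
print(classifier("This movie was an absolute delight to watch."))
# Without an explicit id2label mapping the predicted class is reported as
# LABEL_0 / LABEL_1 (0 = negative, 1 = positive per this notebook's label_to_id).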
Source notebook: https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/community-content/pytorch_text_classification_using_vertex_sdk_and_gcloud/pytorch-text-classification-vertex-ai-train-tune-deploy.ipynb