Xzzit / pytorch-tutorial

Introduction to PyTorch: A comprehensive Chinese course available at the provided link.
https://space.bilibili.com/12580263/channel/series
GNU General Public License v3.0
11 stars 4 forks source link

关于02_dataset中,数据格式为.csv文件的问题 #4

Closed Xzzit closed 1 year ago

Xzzit commented 1 year ago

提问者:B站用户Yoilime

问题:

  1. 数据集格式为csv文件,有8列特征,3923个数据。csv文件该如何转化为DataLoader。
  2. 如何定义神经网络。
  3. 如何定义损失函数。
Xzzit commented 1 year ago

回答:

  1. csv文件可以用pandas.read_csv命令加载,然后用numpy.array将dataframe对象转换为numpy数组,随后用torch.from_numpy()将numpy数组转化为tensor。最后自定义Dataset,并用DataLoader按批次加载。
  2. 这里不用CNN,因为输入数据的维度为1(比如身高、体重、年龄等只有一个数值,不像图像一样具有长宽两个维度),我们这里使用MLP(全连接)。
  3. 这里用MSE(均方误差)。

代码如下:

#导入需要使用的库
import numpy as np
import pandas as pd #读取csv文件的库
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Load the CSV into memory; `cable` is a pandas DataFrame.
data_path = '2017_1.csv'
cable = pd.read_csv(data_path)

# Convert the DataFrame to a NumPy array and drop the first column.
# The remaining columns are cast to float64 explicitly: an integer-typed
# array (e.g. from a CSV that holds only integers) would truncate the
# normalised values computed below.
cable_array = np.array(cable)
cable_array = cable_array[:, 1:].astype(np.float64)

# The first remaining column is the label; `.float()` copies it to float32.
label = cable_array[:, 0]
label = torch.from_numpy(label).float()
cable_array = cable_array[:, 1:]  # everything after the label column is a feature

# Scale every feature column by its own maximum.
# A column whose maximum is 0 is divided by 1 instead, so it is left as-is
# rather than turned into NaN/inf by a division by zero.
column_max = cable_array.max(axis=0)
column_max[column_max == 0] = 1.0
cable_array = cable_array / column_max

# Convert the NumPy feature array to a float32 tensor.
cable_tensor = torch.from_numpy(cable_array).float()

# Custom Dataset (consumed by a DataLoader)
class MyDataset(Dataset):
    """Map-style dataset that pairs each sample in ``data`` with its label.

    Args:
        data: Sized, indexable collection of samples.
        label: Sized, indexable collection of labels, one per sample.

    Raises:
        ValueError: If ``data`` and ``label`` do not have the same length.
    """

    def __init__(self, data, label):
        # Fail fast: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch, or silently ignore the surplus labels.
        if len(data) != len(label):
            raise ValueError(
                f'data and label must have the same length, '
                f'got {len(data)} and {len(label)}'
            )
        self.data = data
        self.label = label

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.data[idx], self.label[idx]

# Pair the feature tensor with the labels in a dataset.
cable_dataset = MyDataset(data=cable_tensor, label=label)

# Serve the dataset in shuffled mini-batches of 16 samples.
train_loader = DataLoader(dataset=cable_dataset, shuffle=True, batch_size=16)

class model(nn.Module):
    """Fully connected network mapping 5 input features to 1 output value.

    Layer widths are 5 -> 128 -> 128 -> 64 -> 1. The four ``nn.Linear``
    layers are applied back to back with no activation function in between.
    """

    def __init__(self):
        super().__init__()
        self.fc_1 = nn.Linear(in_features=5, out_features=128)
        self.fc_2 = nn.Linear(in_features=128, out_features=128)
        self.fc_3 = nn.Linear(in_features=128, out_features=64)
        self.fc_4 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Pass ``x`` through ``fc_1`` .. ``fc_4`` in order and return the result."""
        hidden = x
        for layer in (self.fc_1, self.fc_2, self.fc_3, self.fc_4):
            hidden = layer(hidden)
        return hidden

# Build the network and move it to the GPU when one is available, else the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model().to(device)

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

# Train the model
num_epochs = 50
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}\n-------------------------------')
    for idx, (data, label) in enumerate(train_loader):
        data = data.to(device)
        # Add a trailing dimension to the labels before computing the loss.
        label = label.to(device).unsqueeze(1)

        pred = model(data)
        loss = loss_fn(pred, label)

        # Standard optimisation step: clear old gradients, backpropagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Accuracy is only reported on every 10th epoch.
    if (epoch + 1) % 10 != 0:
        continue

    correct = 0
    size = len(train_loader.dataset)
    with torch.no_grad():
        for img, label in train_loader:
            img = img.to(device)
            label = label.to(device)
            pred = model(img)

            # A sample counts as correct when prediction and label are equal
            # after int() conversion (which truncates toward zero).
            correct += sum(int(p) == int(t) for p, t in zip(pred, label))

    correct /= size
    print(f'Accuracy : {(100*correct):>0.1f}%')
Xzzit commented 1 year ago

更新:

  1. 将loss替换为crossentropy
  2. 引入batchnorm和relu
  3. 精度(准确率)从27%上升至50%
#导入需要使用的库
import numpy as np
import pandas as pd #读取csv文件的库
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Load the CSV into memory; `cable` is a pandas DataFrame.
data_path = '2017_1.csv'
cable = pd.read_csv(data_path)

# Convert the DataFrame to a NumPy array and drop the first column.
# The remaining columns are cast to float64 explicitly: an integer-typed
# array (e.g. from a CSV that holds only integers) would truncate the
# normalised values computed below.
cable_array = np.array(cable)
cable_array = cable_array[:, 1:].astype(np.float64)

# The first remaining column is the label.
label = cable_array[:, 0]
label = torch.from_numpy(label)
# Shift the labels so that the smallest one becomes 0.
label = label - label.min()

cable_array = cable_array[:, 1:]  # everything after the label column is a feature

# Scale every feature column by its own maximum.
# A column whose maximum is 0 is divided by 1 instead, so it is left as-is
# rather than turned into NaN/inf by a division by zero.
column_max = cable_array.max(axis=0)
column_max[column_max == 0] = 1.0
cable_array = cable_array / column_max

# Convert the NumPy feature array to a float32 tensor.
cable_tensor = torch.from_numpy(cable_array).float()

# Custom Dataset (consumed by a DataLoader)
class MyDataset(Dataset):
    """Map-style dataset that pairs each sample in ``data`` with an integer label.

    Args:
        data: Sized, indexable collection of samples.
        label: Sized, indexable collection of labels, one per sample; each
            entry must be convertible with ``int()``.

    Raises:
        ValueError: If ``data`` and ``label`` do not have the same length.
    """

    def __init__(self, data, label):
        # Fail fast: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch, or silently ignore the surplus labels.
        if len(data) != len(label):
            raise ValueError(
                f'data and label must have the same length, '
                f'got {len(data)} and {len(label)}'
            )
        self.data = data
        self.label = label

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` at ``idx``; the label is a Python ``int``."""
        return self.data[idx], int(self.label[idx])

# Pair the feature tensor with the labels in a dataset.
cable_dataset = MyDataset(data=cable_tensor, label=label)

# Serve the dataset in shuffled mini-batches of 16 samples.
train_loader = DataLoader(dataset=cable_dataset, shuffle=True, batch_size=16)

class model(nn.Module):
    """MLP classifier: three Linear -> BatchNorm1d -> ReLU stages plus a linear head.

    Hidden widths are 256 -> 256 -> 128. The output is one raw score (logit)
    per class; no softmax is applied here.

    Args:
        in_features: Number of input features per sample. Defaults to 5.
        num_classes: Number of output scores per sample. Defaults to 42.
    """

    def __init__(self, in_features=5, num_classes=42):
        super().__init__()
        self.fc_1 = self._dense_block(in_features, 256)
        self.fc_2 = self._dense_block(256, 256)
        self.fc_3 = self._dense_block(256, 128)
        self.fc_4 = nn.Linear(128, num_classes)

    @staticmethod
    def _dense_block(in_dim, out_dim):
        """Build one Linear -> BatchNorm1d -> ReLU stage."""
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        """Pass ``x`` through ``fc_1`` .. ``fc_4`` in order and return the scores."""
        out = self.fc_1(x)
        out = self.fc_2(out)
        out = self.fc_3(out)
        out = self.fc_4(out)
        return out

# Build the network and move it to the GPU when one is available, else the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model().to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train the model
num_epochs = 200
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}\n-------------------------------')

    # Training mode: BatchNorm layers normalise with the statistics of the
    # current batch and update their running averages.
    model.train()
    for data, label in train_loader:
        data, label = data.to(device), label.to(device)

        pred = model(data)
        loss = loss_fn(pred, label)

        # Standard optimisation step: clear old gradients, backpropagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Every 10th epoch, report accuracy on the training set.
    if (epoch + 1) % 10 == 0:
        correct = 0
        size = len(train_loader.dataset)

        # Evaluation mode: BatchNorm layers use their stored running
        # statistics, so the score does not depend on how samples happen to
        # be batched and evaluating does not modify the model.
        # model.train() at the top of the next epoch switches back.
        model.eval()
        with torch.no_grad():
            for img, label in train_loader:
                img, label = img.to(device), label.to(device)
                pred = model(img)
                # The predicted class is the index of the largest score.
                correct += (pred.argmax(1) == label).type(torch.float).sum().item()

        correct /= size
        print(f'Accuracy : {(100*correct):>0.1f}%')