import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader, Dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
newsgroups_data = fetch_20newsgroups(subset='all')
texts, labels = newsgroups_data.data, newsgroups_data.target
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
# Split the data into train:test = 8:2
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=2024)
# Vectorize the text data
# token_pattern r'\b\w+\b':
# r: raw string, so backslashes are not interpreted as escape sequences
# \b: word boundary (the start or end of a word)
# \w+: one or more word characters (letters, digits, or underscore)
# (a quick demo of this pattern follows right after the vectorization step)
vectorizer = CountVectorizer(max_features=10000, token_pattern=r'\b\w+\b')
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()
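As a quick illustration of what this token_pattern matches, the pattern can be applied directly with the re module (the sample sentence is just an assumption for demonstration). Note that, unlike scikit-learn's default pattern r'(?u)\b\w\w+\b', it also keeps single-character tokens.
import re

sample = "The GPU's price dropped to $399 in 2024!"
print(re.findall(r'\b\w+\b', sample))
# ['The', 'GPU', 's', 'price', 'dropped', 'to', '399', 'in', '2024']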
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
# Define the Dataset class
class NewsGroupDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
train_dataset = NewsGroupDataset(X_train_tensor, y_train_tensor)
test_dataset = NewsGroupDataset(X_test_tensor, y_test_tensor)
len(train_dataset)
15076
train_dataset[0]
(tensor([0., 0., 0., ..., 0., 0., 0.]), tensor(11))
# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
# RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initial hidden state: (num_layers, batch_size, hidden_size)
        h = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h)
        # Classify from the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out
input_size = 10000
hidden_size = 128
output_size = len(label_encoder.classes_)
num_layers = 1
model = RNNModel(input_size, hidden_size, output_size, num_layers)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
# Training
loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        # Add a sequence-length dimension: (batch, 1, input_size)
        X_batch = X_batch.unsqueeze(1)
        outputs = model(X_batch)
        loss = loss_fun(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')
Epoch: 1/10, Loss: 0.3300
Epoch: 2/10, Loss: 0.2793
Epoch: 3/10, Loss: 0.1355
Epoch: 4/10, Loss: 0.0794
Epoch: 5/10, Loss: 0.0059
Epoch: 6/10, Loss: 0.0399
Epoch: 7/10, Loss: 0.0057
Epoch: 8/10, Loss: 0.0090
Epoch: 9/10, Loss: 0.0033
Epoch: 10/10, Loss: 0.0024
# Evaluation
model.eval()
y_test, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device).unsqueeze(1)
        outputs = model(X_batch)
        _, pred = torch.max(outputs, 1)
        y_test.extend(y_batch.numpy())
        y_pred.extend(pred.cpu().numpy())

accuracy = accuracy_score(y_test, y_pred)
print(f'accuracy: {accuracy:.4f}')
accuracy: 0.9016
1. LSTM (Long Short-Term Memory)
- A vanilla RNN tends to lose information over time and suffers from the vanishing gradient problem when processing sequence data
- The LSTM is a type of recurrent neural network (RNN) designed so that long sequences can be learned effectively
1-1. Structure of the LSTM
- Input gate: uses the current input and the previous hidden state to decide which new information to store
- Forget gate: uses the current input and the previous hidden state to decide which information to forget
- Output gate: uses the current input and the previous hidden state to determine the next hidden state
- Cell state: a path along which information flows directly, so that it is not lost (a single-step sketch of these updates is shown below)
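To make the gate updates concrete, here is a minimal sketch of one LSTM time step written with plain tensor operations. The weight names (W, U, b) and the toy sizes are illustrative assumptions; nn.LSTM performs the equivalent computation internally.
import torch

torch.manual_seed(0)
input_size, hidden_size = 4, 3
x_t = torch.randn(1, input_size)       # current input
h_prev = torch.zeros(1, hidden_size)   # previous hidden state
c_prev = torch.zeros(1, hidden_size)   # previous cell state

# One input-to-hidden matrix, hidden-to-hidden matrix, and bias per gate
W = {g: torch.randn(input_size, hidden_size) for g in ('i', 'f', 'o', 'g')}
U = {g: torch.randn(hidden_size, hidden_size) for g in ('i', 'f', 'o', 'g')}
b = {g: torch.zeros(hidden_size) for g in ('i', 'f', 'o', 'g')}

i_t = torch.sigmoid(x_t @ W['i'] + h_prev @ U['i'] + b['i'])  # input gate
f_t = torch.sigmoid(x_t @ W['f'] + h_prev @ U['f'] + b['f'])  # forget gate
o_t = torch.sigmoid(x_t @ W['o'] + h_prev @ U['o'] + b['o'])  # output gate
g_t = torch.tanh(x_t @ W['g'] + h_prev @ U['g'] + b['g'])     # candidate cell state

c_t = f_t * c_prev + i_t * g_t   # cell state: forget part of the old, add part of the new
h_t = o_t * torch.tanh(c_t)      # next hidden state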
1-2. Converting the example to an LSTM
newsgroups_data = fetch_20newsgroups(subset='all')
texts, labels = newsgroups_data.data, newsgroups_data.target
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=2024)
vectorizer = CountVectorizer(max_features=10000, token_pattern=r'\b\w+\b')
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
class NewsGroupDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
train_dataset = NewsGroupDataset(X_train_tensor, y_train_tensor)
test_dataset = NewsGroupDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initial hidden and cell states: (num_layers, batch_size, hidden_size)
        h = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h, c))
        out = self.fc(out[:, -1, :])
        return out
input_size = 10000
hidden_size = 128
output_size = len(label_encoder.classes_)
num_layers = 1
model = LSTMModel(input_size, hidden_size, output_size, num_layers)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        X_batch = X_batch.unsqueeze(1)  # (batch, 1, input_size)
        outputs = model(X_batch)
        loss = loss_fun(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')
Epoch: 1/10, Loss: 0.5123
Epoch: 2/10, Loss: 0.3091
Epoch: 3/10, Loss: 0.0286
Epoch: 4/10, Loss: 0.0396
Epoch: 5/10, Loss: 0.0090
Epoch: 6/10, Loss: 0.0085
Epoch: 7/10, Loss: 0.0573
Epoch: 8/10, Loss: 0.0133
Epoch: 9/10, Loss: 0.0049
Epoch: 10/10, Loss: 0.0007
model.eval()
y_test, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device).unsqueeze(1)
        outputs = model(X_batch)
        _, pred = torch.max(outputs, 1)
        y_test.extend(y_batch.numpy())
        y_pred.extend(pred.cpu().numpy())

accuracy = accuracy_score(y_test, y_pred)
print(f'accuracy: {accuracy:.4f}')
accuracy: 0.9003
2. GRU (Gated Recurrent Unit)
- A type of RNN that is similar to the LSTM but has a simpler structure
- Unlike the LSTM, it has no cell state; it processes information with an update gate and a reset gate (a single-step sketch is shown below)
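For comparison with the LSTM sketch above, one GRU step can be written the same way; this follows the gate convention used by nn.GRU, and the weight names and toy sizes are again illustrative assumptions.
import torch

torch.manual_seed(0)
input_size, hidden_size = 4, 3
x_t = torch.randn(1, input_size)       # current input
h_prev = torch.zeros(1, hidden_size)   # previous hidden state (no cell state)

W = {g: torch.randn(input_size, hidden_size) for g in ('z', 'r', 'n')}
U = {g: torch.randn(hidden_size, hidden_size) for g in ('z', 'r', 'n')}

z_t = torch.sigmoid(x_t @ W['z'] + h_prev @ U['z'])        # update gate
r_t = torch.sigmoid(x_t @ W['r'] + h_prev @ U['r'])        # reset gate
n_t = torch.tanh(x_t @ W['n'] + (r_t * h_prev) @ U['n'])   # candidate hidden state
h_t = (1 - z_t) * n_t + z_t * h_prev                       # interpolate old and new states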
newsgroups_data = fetch_20newsgroups(subset='all')
texts, labels = newsgroups_data.data, newsgroups_data.target
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=2024)
vectorizer = CountVectorizer(max_features=10000, token_pattern=r'\b\w+\b')
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
class NewsGroupDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
train_dataset = NewsGroupDataset(X_train_tensor, y_train_tensor)
test_dataset = NewsGroupDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initial hidden state: (num_layers, batch_size, hidden_size)
        h = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.gru(x, h)
        out = self.fc(out[:, -1, :])
        return out
input_size = 10000
hidden_size = 128
output_size = len(label_encoder.classes_)
num_layers = 1
model = GRUModel(input_size, hidden_size, output_size, num_layers)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        X_batch = X_batch.unsqueeze(1)  # (batch, 1, input_size)
        outputs = model(X_batch)
        loss = loss_fun(outputs, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')
model.eval()
y_test, y_pred = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device).unsqueeze(1)
        outputs = model(X_batch)
        _, pred = torch.max(outputs, 1)
        y_test.extend(y_batch.numpy())
        y_pred.extend(pred.cpu().numpy())

accuracy = accuracy_score(y_test, y_pred)
print(f'accuracy: {accuracy:.4f}')
Epoch: 1/10, Loss: 0.5628
Epoch: 2/10, Loss: 0.2175
Epoch: 3/10, Loss: 0.0173
Epoch: 4/10, Loss: 0.0675
Epoch: 5/10, Loss: 0.0116
Epoch: 6/10, Loss: 0.0050
Epoch: 7/10, Loss: 0.0512
Epoch: 8/10, Loss: 0.0016
Epoch: 9/10, Loss: 0.0018
Epoch: 10/10, Loss: 0.0011
accuracy: 0.9024
3. LSTM vs GRU
LSTM and GRU were designed to address the vanishing gradient problem of the vanilla RNN.
They use gate mechanisms to keep important information and discard unnecessary information.
Because they handle long sequences effectively, they are used in many natural language processing tasks.
LSTM
- Number of gates: 3 (input, forget, output)
- Maintains a cell state
- More complex structure
- More parameters than the GRU
- Takes longer to train than the GRU
GRU
- Number of gates: 2 (update, reset)
- No cell state
- Simpler structure
- Fewer parameters than the LSTM (see the comparison sketch after this list)
- Trains faster than the LSTM
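As a quick sanity check on the parameter-count claim, the two recurrent layers used in this post can be compared directly; this is only an illustrative check with the same sizes as above.
import torch.nn as nn

lstm = nn.LSTM(input_size=10000, hidden_size=128, num_layers=1, batch_first=True)
gru = nn.GRU(input_size=10000, hidden_size=128, num_layers=1, batch_first=True)

lstm_params = sum(p.numel() for p in lstm.parameters())  # 4 weight blocks (i, f, g, o)
gru_params = sum(p.numel() for p in gru.parameters())    # 3 weight blocks (r, z, n)
print(f'LSTM parameters: {lstm_params:,}')
print(f'GRU parameters:  {gru_params:,}')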