import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
urllib.request.urlretrieve('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt', filename='ratings_train.txt')
urllib.request.urlretrieve('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt', filename='ratings_test.txt')
('ratings_test.txt', <http.client.HTTPMessage at 0x7e28ba41d0c0>)
train_dataset = pd.read_table('ratings_train.txt')
train_dataset
|  | id | document | label |
|---|---|---|---|
0 | 9976970 | 아 더빙.. 진짜 짜증나네요 목소리 | 0 |
1 | 3819312 | 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나 | 1 |
2 | 10265843 | 너무재밓었다그래서보는것을추천한다 | 0 |
3 | 9045019 | 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정 | 0 |
4 | 6483659 | 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ... | 1 |
... | ... | ... | ... |
149995 | 6222902 | 인간이 문제지.. 소는 뭔죄인가.. | 0 |
149996 | 8549745 | 평점이 너무 낮아서... | 1 |
149997 | 9311800 | 이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다? | 0 |
149998 | 2376369 | 청춘 영화의 최고봉.방황과 우울했던 날들의 자화상 | 1 |
149999 | 9619869 | 한국 영화 최초로 수간하는 내용이 담긴 영화 | 0 |
150000 rows × 3 columns
# positive / negative label ratio
train_dataset['label'].value_counts()
label
0    75173
1    74827
Name: count, dtype: int64
sum(train_dataset['document'].isnull())
5
~train_dataset['document'].isnull()
0         True
1         True
2         True
3         True
4         True
          ...
149995    True
149996    True
149997    True
149998    True
149999    True
Name: document, Length: 150000, dtype: bool
train_dataset = train_dataset[~train_dataset['document'].isnull()]
sum(train_dataset['document'].isnull())
0
train_dataset
|  | id | document | label |
|---|---|---|---|
0 | 9976970 | 아 더빙.. 진짜 짜증나네요 목소리 | 0 |
1 | 3819312 | 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나 | 1 |
2 | 10265843 | 너무재밓었다그래서보는것을추천한다 | 0 |
3 | 9045019 | 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정 | 0 |
4 | 6483659 | 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ... | 1 |
... | ... | ... | ... |
149995 | 6222902 | 인간이 문제지.. 소는 뭔죄인가.. | 0 |
149996 | 8549745 | 평점이 너무 낮아서... | 1 |
149997 | 9311800 | 이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다? | 0 |
149998 | 2376369 | 청춘 영화의 최고봉.방황과 우울했던 날들의 자화상 | 1 |
149999 | 9619869 | 한국 영화 최초로 수간하는 내용이 담긴 영화 | 0 |
149995 rows × 3 columns
Tokenization
- For a model to process natural language, the text must first be converted into numeric form (token ids), as sketched below.
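As a toy illustration (hypothetical mini-vocabulary, not the NSMC data), whitespace tokens can be mapped to integer ids with a plain dictionary; tokens missing from the vocabulary fall back to an unknown id:
# Toy sketch: whitespace tokenization + integer ids (hypothetical mini-vocabulary)
toy_vocab = ['[PAD]', '[UNK]', '영화', '진짜', '재밌다']
toy_token_to_idx = {token: i for i, token in enumerate(toy_vocab)}
toy_ids = [toy_token_to_idx.get(token, 1) for token in '진짜 재밌다 이 영화'.split()]  # 1 = [UNK]
print(toy_ids)  # [3, 4, 1, 2]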
train_dataset['document'].iloc[0].split()
['아', '더빙..', '진짜', '짜증나네요', '목소리']
vocab = set()
for doc in train_dataset['document']:
    for token in doc.split():
        vocab.add(token)
len(vocab)
357862
# Count how often each token occurs; expected format:
'''
[('아', 1204),
 ('더빙..', 2),
 ('진짜', 5929),
 ...]
'''
vocab_cnt_dict = {}
for doc in train_dataset['document']:
    for token in doc.split():
        if token not in vocab_cnt_dict:
            vocab_cnt_dict[token] = 0
        vocab_cnt_dict[token] += 1
vocab_cnt_dict
{'아': 1204, '더빙..': 2, '진짜': 5929, '짜증나네요': 10, '목소리': 99, '흠...포스터보고': 1, '초딩영화줄....오버연기조차': 1, '가볍지': 17, '않구나': 2, '너무재밓었다그래서보는것을추천한다': 1, '교도소': 4, '이야기구먼': 1, ...}
vocab_cnt_list = [(token, cnt) for token, cnt in vocab_cnt_dict.items()]
vocab_cnt_list[:10]
[('아', 1204), ('더빙..', 2), ('진짜', 5929), ('짜증나네요', 10), ('목소리', 99), ('흠...포스터보고', 1), ('초딩영화줄....오버연기조차', 1), ('가볍지', 17), ('않구나', 2), ('너무재밓었다그래서보는것을추천한다', 1)]
top_vocabs = sorted(vocab_cnt_list, key=lambda tup: tup[1], reverse=True)
top_vocabs[:10]
[('영화', 10825), ('너무', 8239), ('정말', 7791), ('진짜', 5929), ('이', 5059), ('영화.', 3598), ('왜', 3285), ('더', 3260), ('이런', 3249), ('그냥', 3237)]
cnts = [cnt for _, cnt in top_vocabs]
cnts
[10825, 8239, 7791, 5929, 5059, ...]
np.mean(cnts)
3.1792590439890236
cnts[:10]
[10825, 8239, 7791, 5929, 5059, 3598, 3285, 3260, 3249, 3237]
sum(np.array(cnts) > 2)
42635
n_vocab = sum(np.array(cnts) > 2)
n_vocab
42635
top_vocabs_truncated = top_vocabs[:n_vocab]
top_vocabs_truncated[:5]
[('영화', 10825), ('너무', 8239), ('정말', 7791), ('진짜', 5929), ('이', 5059)]
vocabs = [token for token, _ in top_vocabs_truncated]
vocabs[:5]
['영화', '너무', '정말', '진짜', '이']
Special tokens
- [UNK]: Unknown token
- [PAD]: Padding token
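A quick toy illustration of why both are needed (made-up ids, not the real vocabulary): [UNK] stands in for tokens the vocabulary has never seen, and [PAD] fills shorter sequences so every example in a batch has the same length and can be stacked into one tensor:
# Toy sketch: pad variable-length id sequences to a common length (0 = [PAD], 1 = [UNK])
toy_batch = [[5, 9, 2], [7, 1]]  # the second sequence contains an unknown token (id 1)
max_len = 4
padded = [seq + [0] * (max_len - len(seq)) for seq in toy_batch]
print(padded)  # [[5, 9, 2, 0], [7, 1, 0, 0]]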
unk_token = '[UNK]'
unk_token in vocabs
False
pad_token = '[PAD]'
pad_token in vocabs
False
vocabs.insert(0, unk_token)
vocabs.insert(0, pad_token)
vocabs[:5]
['[PAD]', '[UNK]', '영화', '너무', '정말']
idx_to_token = vocabs
token_to_idx = {token: i for i, token in enumerate(idx_to_token)}
class Tokenizer:
    def __init__(self, vocabs, use_padding=True, max_padding=64, pad_token='[PAD]', unk_token='[UNK]'):
        self.idx_to_token = vocabs
        self.token_to_idx = {token: i for i, token in enumerate(self.idx_to_token)}
        self.use_padding = use_padding
        self.max_padding = max_padding
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.unk_token_idx = self.token_to_idx[self.unk_token]
        self.pad_token_idx = self.token_to_idx[self.pad_token]

    def __call__(self, x: str):
        token_ids = []
        token_list = x.split()
        for token in token_list:
            # map each whitespace token to its id, falling back to [UNK]
            if token in self.token_to_idx:
                token_idx = self.token_to_idx[token]
            else:
                token_idx = self.unk_token_idx
            token_ids.append(token_idx)
        if self.use_padding:
            # truncate to max_padding tokens, then right-pad with [PAD]
            token_ids = token_ids[:self.max_padding]
            n_pads = self.max_padding - len(token_ids)
            token_ids = token_ids + [self.pad_token_idx] * n_pads
        return token_ids
tokenizer = Tokenizer(vocabs, use_padding=False)
sample = train_dataset['document'].iloc[0]
print(sample)
아 더빙.. 진짜 짜증나네요 목소리
tokenizer(sample) # [51, 1, 5, 10485, 1064]
[51, 1, 5, 10485, 1064]
token_length_list = []
for sample in train_dataset['document']:
    token_length_list.append(len(tokenizer(sample)))
plt.hist(token_length_list)
plt.xlabel('token length')
plt.ylabel('count')
Text(0, 0.5, 'count')
max(token_length_list)
41
tokenizer = Tokenizer(vocabs, use_padding=True, max_padding=50)
print(tokenizer(sample))
[201, 2, 3635, 1, 121, 1946, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
train_valid_dataset = pd.read_table('ratings_train.txt')
test_dataset = pd.read_table('ratings_test.txt')
print(f'train, valid samples: {len(train_valid_dataset)}')
print(f'test samples: {len(test_dataset)}')
train, valid samples: 150000
test samples: 50000
train_valid_dataset.head()
|  | id | document | label |
|---|---|---|---|
0 | 9976970 | 아 더빙.. 진짜 짜증나네요 목소리 | 0 |
1 | 3819312 | 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나 | 1 |
2 | 10265843 | 너무재밓었다그래서보는것을추천한다 | 0 |
3 | 9045019 | 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정 | 0 |
4 | 6483659 | 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ... | 1 |
train_valid_dataset = train_valid_dataset.sample(frac=1)
train_valid_dataset.head()
|  | id | document | label |
|---|---|---|---|
62817 | 4384598 | 전체적으로 보면 | 1 |
143327 | 6811570 | 1~2회 보다가 예전에 방영한 내 여자친구는 구미호 느낌에 오버랩이 된다고나 할까.... | 0 |
78075 | 9312057 | 이 세상에 진정한 용서란 없다 용서는있고 반성은 없는 지금의 대한민국 | 1 |
133601 | 9497628 | 시작할때 동성애부분이 조금 불편했지만정말 편견을 버리게 해주는 좋은 작품미다. | 1 |
36115 | 9297049 | 이런 작품이 많이 늘어나면 세상이 밝아질텐데 라는 생각을 했습니다^^ 감우성님은 명... | 1 |
train_ratio = 0.8
n_train = int(len(train_valid_dataset) * train_ratio)
train_df = train_valid_dataset[:n_train]
valid_df = train_valid_dataset[n_train:]
test_df = test_dataset
print(f'valid samples: {len(valid_df)}')
print(f'train samples: {len(train_df)}')
print(f'test samples: {len(test_df)}')
valid samples: 30000
train samples: 120000
test samples: 50000
# Shuffle each split once more (to subsample, e.g. to 1/10 of the data for quicker runs, frac=0.1 could be used instead)
train_df = train_df.sample(frac=1)
valid_df = valid_df.sample(frac=1)
test_df = test_df.sample(frac=1)
print(f'valid samples: {len(valid_df)}')
print(f'train samples: {len(train_df)}')
print(f'test samples: {len(test_df)}')
valid samples: 30000
train samples: 120000
test samples: 50000
class NSMCDataset(Dataset):
    def __init__(self, data_df, tokenizer=None):
        self.data_df = data_df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, idx):
        sample_raw = self.data_df.iloc[idx]
        sample = {}
        sample['doc'] = str(sample_raw['document'])
        sample['label'] = int(sample_raw['label'])
        if self.tokenizer is not None:
            sample['doc_ids'] = self.tokenizer(sample['doc'])
        return sample
train_dataset = NSMCDataset(data_df=train_df, tokenizer=tokenizer)
valid_dataset = NSMCDataset(data_df=valid_df, tokenizer=tokenizer)
test_dataset = NSMCDataset(data_df=test_df, tokenizer=tokenizer)
print(train_dataset[0])
{'doc': '대한민국 성우들 다 죽여라. 가수를 성우로 쓰고 자빠졌네..', 'label': 0, 'doc_ids': [824, 6370, 15, 28639, 29116, 18664, 1790, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
def collate_fn(batch):
    # gather each field of the batch into a plain Python list; tensors are built later in the training loop
    keys = list(batch[0].keys())
    data = {key: [] for key in keys}
    for item in batch:
        for key in keys:
            data[key].append(item[key])
    return data
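Each sample mixes a raw string ('doc') with integer ids, so the default collate cannot stack everything into tensors; this collate just gathers each field into a list. As an alternative sketch (not used below), the id and label tensors could be built inside the collate itself, since use_padding=True guarantees equal lengths:
# Alternative sketch (not used below): build the tensors inside the collate
def collate_fn_tensor(batch):
    return {
        'doc': [item['doc'] for item in batch],
        'doc_ids': torch.tensor([item['doc_ids'] for item in batch], dtype=torch.long),
        'label': torch.tensor([item['label'] for item in batch], dtype=torch.long),
    }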
train_dataloader = DataLoader(
    train_dataset,
    batch_size=128,
    collate_fn=collate_fn,
    shuffle=True,
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=128,
    collate_fn=collate_fn,
    shuffle=False,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=128,
    collate_fn=collate_fn,
    shuffle=False,
)
sample = next(iter(train_dataloader))
sample.keys() # dict_keys(['doc', 'label', 'doc_ids'])
dict_keys(['doc', 'label', 'doc_ids'])
sample['doc'][2]  # a random review from the shuffled batch (contents change every run since shuffle=True)
'지루함반... 하품반...'
print(sample['doc_ids'][2])  # the corresponding token ids, padded to length 50
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
CNN Model
class SentenceCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, word_win_size=[3, 5, 7]):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.word_win_size = word_win_size
        self.conv_list = nn.ModuleList(
            [nn.Conv2d(1, 1, kernel_size=(w, embed_dim)) for w in self.word_win_size]
        )
        self.embeddings = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.output_dim = len(self.word_win_size)

    def forward(self, X):
        batch_size, seq_len = X.size()
        X = self.embeddings(X)  # batch_size * seq_len * embed_dim
        X = X.view(batch_size, 1, seq_len, self.embed_dim)  # batch_size * channel(1) * seq_len(H) * embed_dim(W)
        C = [F.relu(conv(X)) for conv in self.conv_list]  # each: batch_size * 1 * (seq_len - w + 1) * 1
        C_hat = torch.stack([F.max_pool2d(c, c.size()[2:]).squeeze() for c in C], dim=1)  # max-over-time pooling -> batch_size * len(word_win_size)
        return C_hat
class Classifier(nn.Module):
    def __init__(self, sr_model, output_dim, vocab_size, embed_dim, **kwargs):
        super().__init__()
        self.sr_model = sr_model(vocab_size=vocab_size, embed_dim=embed_dim, **kwargs)
        self.input_dim = self.sr_model.output_dim
        self.output_dim = output_dim
        self.fc = nn.Linear(self.input_dim, self.output_dim)

    def forward(self, x):
        return self.fc(self.sr_model(x))
model = Classifier(sr_model=SentenceCNN, output_dim=2, vocab_size=len(vocabs), embed_dim=16)
model.sr_model.embeddings.weight[1]
tensor([-1.3872, 0.0734, -1.1280, -1.3668, -0.3136, 1.6393, -0.2068, 1.1680, -1.2082, 0.9183, -0.0433, -1.7420, -0.9645, -0.2875, 1.1536, -0.0579], grad_fn=<SelectBackward0>)
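A quick shape check with a dummy batch of random ids (made-up data, just to confirm the dimensions): the CNN produces one max-pooled feature per window size (3, 5, 7), and the linear head maps those three features to two class logits.
# Sanity check with random token ids (dummy data, not from the dataset)
dummy_ids = torch.randint(low=0, high=len(vocabs), size=(4, 50))  # batch_size=4, seq_len=50
with torch.no_grad():
    features = model.sr_model(dummy_ids)  # torch.Size([4, 3])
    logits = model(dummy_ids)             # torch.Size([4, 2])
print(features.shape, logits.shape)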
use_cuda = torch.cuda.is_available()
if use_cuda:
    model.cuda()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)
calc_loss = nn.CrossEntropyLoss()
n_epoch = 10
global_i = 0
valid_loss_history = []
train_loss_history = []
best_model = None
best_epoch_i = None
min_valid_loss = 9e+9
for epoch_i in range(n_epoch):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        X = torch.tensor(batch['doc_ids'])
        y = torch.tensor(batch['label'])
        if use_cuda:
            X = X.cuda()
            y = y.cuda()
        y_pred = model(X)
        loss = calc_loss(y_pred, y)
        if global_i % 1000 == 0:
            print(f'i: {global_i}, epoch: {epoch_i}, loss: {loss.item()}')
        train_loss_history.append((global_i, loss.item()))
        loss.backward()
        optimizer.step()
        global_i += 1

    # validation at the end of each epoch
    model.eval()
    valid_loss_list = []
    for batch in valid_dataloader:
        X = torch.tensor(batch['doc_ids'])
        y = torch.tensor(batch['label'])
        if use_cuda:
            X = X.cuda()
            y = y.cuda()
        y_pred = model(X)
        loss = calc_loss(y_pred, y)
        valid_loss_list.append(loss.item())
    valid_loss_mean = np.mean(valid_loss_list)
    valid_loss_history.append((global_i, valid_loss_mean.item()))

    # keep the checkpoint with the lowest validation loss
    if valid_loss_mean < min_valid_loss:
        min_valid_loss = valid_loss_mean
        best_epoch_i = epoch_i
        best_model = deepcopy(model)

    if epoch_i % 2 == 0:
        print("*"*30)
        print(f'valid_loss_mean: {valid_loss_mean}')
        print("*"*30)

print(f'best_epoch: {best_epoch_i}')
i: 0, epoch: 0, loss: 0.7011118531227112
******************************
valid_loss_mean: 0.4602986348436234
******************************
i: 1000, epoch: 1, loss: 0.3827470541000366
i: 2000, epoch: 2, loss: 0.29700979590415955
******************************
valid_loss_mean: 0.4793511497213485
******************************
i: 3000, epoch: 3, loss: 0.2813290059566498
i: 4000, epoch: 4, loss: 0.21812118589878082
******************************
valid_loss_mean: 0.5715509579536763
******************************
i: 5000, epoch: 5, loss: 0.21523813903331757
i: 6000, epoch: 6, loss: 0.23151689767837524
******************************
valid_loss_mean: 0.7064357346676765
******************************
i: 7000, epoch: 7, loss: 0.15933875739574432
i: 8000, epoch: 8, loss: 0.12855540215969086
******************************
valid_loss_mean: 0.8573142164565147
******************************
i: 9000, epoch: 9, loss: 0.17837409675121307
best_epoch: 1
def calc_moving_average(arr, win_size=100):
    new_arr = []
    win = []
    for i, val in enumerate(arr):
        win.append(val)
        if len(win) > win_size:
            win.pop(0)
        new_arr.append(np.mean(win))
    return np.array(new_arr)
valid_loss_history = np.array(valid_loss_history)
train_loss_history = np.array(train_loss_history)
plt.figure(figsize=(12,8))
plt.plot(train_loss_history[:, 0],
         calc_moving_average(train_loss_history[:, 1]), color='blue')
plt.plot(valid_loss_history[:, 0],
         valid_loss_history[:, 1], color='red')
plt.xlabel("step")
plt.ylabel("loss")
Text(0, 0.5, 'loss')
Evaluation
model = best_model
model.eval()
total = 0
correct = 0
for batch in tqdm(test_dataloader,
                  total=len(test_dataloader.dataset)//test_dataloader.batch_size):
    X = torch.tensor(batch['doc_ids'])
    y = torch.tensor(batch['label'])
    if use_cuda:
        X = X.cuda()
        y = y.cuda()
    y_pred = model(X)
    curr_correct = y_pred.argmax(dim=1) == y
    total += len(curr_correct)
    correct += sum(curr_correct)

print(f'test accuracy: {correct/total}')
0%| | 0/390 [00:00<?, ?it/s]
test accuracy: 0.7748399972915649
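As a final check, the best model can be applied to new sentences. This is a minimal sketch (the example reviews are made up, and the predicted labels depend on the trained weights); note that the CNN's squeeze() collapses the batch dimension for a single example, so at least two texts are passed at once.
# Inference sketch: classify new reviews with the best model (example texts are hypothetical)
def predict_sentiment(texts, model, tokenizer):
    ids = torch.tensor([tokenizer(t) for t in texts])  # (n_texts, max_padding)
    if use_cuda:
        ids = ids.cuda()
    with torch.no_grad():
        logits = model(ids)
    return logits.argmax(dim=1).tolist()  # 1 = positive, 0 = negative

print(predict_sentiment(['정말 재밌고 감동적인 영화', '시간 낭비였다 너무 지루함'], model, tokenizer))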