Lim0011 committed on
Commit
960d190
1 Parent(s): de7cb37

Upload 2 files

Files changed (2)
  1. example/ex2_final.py +140 -0
  2. example/ex2_init.py +61 -0
example/ex2_final.py ADDED
@@ -0,0 +1,140 @@
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel

# Define constants: the six analytic dimensions scored for each essay.
DIMENSIONS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Wraps essay texts and their six target scores as BERT-ready tensors.
class EssayDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        target = self.targets[item]

        # Tokenize, pad/truncate to max_len, and return PyTorch tensors.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }

# BERT encoder with a dropout + linear head that regresses all six scores at once.
class EssayScoreRegressor(nn.Module):
    def __init__(self, n_outputs):
        super(EssayScoreRegressor, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)

    def forward(self, input_ids, attention_mask):
        pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )['pooler_output']
        output = self.drop(pooled_output)
        return self.out(output)

# One pass over the training data; returns the mean batch loss.
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []

    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, targets)

        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return np.mean(losses)

def train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs, batch_size, max_len):
    train_dataset = EssayDataset(
        texts=train_data['full_text'].to_numpy(),
        targets=train_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    val_dataset = EssayDataset(
        texts=val_data['full_text'].to_numpy(),
        targets=val_data[DIMENSIONS].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    train_data_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    # Note: the validation loader is built but never consumed below;
    # this script reports training loss only.
    val_data_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    loss_fn = nn.MSELoss().to(device)

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)

        train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            len(train_dataset)
        )

        print(f'Train loss {train_loss}')

if __name__ == "__main__":
    df = pd.read_csv('train.csv')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EssayScoreRegressor(n_outputs=len(DIMENSIONS))
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=2e-5)
    # step_size counts scheduler steps (one per batch), so the LR is only
    # decayed by gamma after total_steps batches.
    total_steps = len(df) // 16 * 5
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=total_steps, gamma=0.1)

    # 80/20 train/validation split.
    train_data = df.sample(frac=0.8, random_state=42)
    val_data = df.drop(train_data.index)

    train_model(train_data, val_data, tokenizer, model, optimizer, scheduler, device, epochs=5, batch_size=16, max_len=160)
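
The script above builds val_data_loader but never consumes it, so no validation loss is reported. A minimal evaluation pass is sketched below, assuming the same batch keys produced by EssayDataset; eval_epoch is a hypothetical helper, not part of this commit.

def eval_epoch(model, data_loader, loss_fn, device):
    # Hypothetical helper: mean validation loss over one pass, no gradients.
    model = model.eval()
    losses = []
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            losses.append(loss_fn(outputs, targets).item())
    return np.mean(losses)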
example/ex2_init.py ADDED
@@ -0,0 +1,61 @@
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import random
import torch
from sklearn.model_selection import train_test_split

DIMENSIONS = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
SEED = 42

random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

# Per-dimension RMSE; the mean of these values is the MCRMSE.
def compute_metrics_for_regression(y_test, y_test_pred):
    metrics = {}
    for task in DIMENSIONS:
        targets_task = [t[DIMENSIONS.index(task)] for t in y_test]
        pred_task = [p[DIMENSIONS.index(task)] for p in y_test_pred]

        rmse = mean_squared_error(targets_task, pred_task, squared=False)

        metrics[f"rmse_{task}"] = rmse

    return metrics

def train_model(X_train, y_train, X_valid, y_valid):
    model = None  # Placeholder for model training
    return model

def predict(model, X):
    # Placeholder: random scores stand in for model predictions.
    y_pred = np.random.rand(len(X), len(DIMENSIONS))
    return y_pred

if __name__ == '__main__':

    ellipse_df = pd.read_csv('train.csv',
                             header=0, names=['text_id', 'full_text', 'Cohesion', 'Syntax',
                                              'Vocabulary', 'Phraseology', 'Grammar', 'Conventions'],
                             index_col='text_id')
    ellipse_df = ellipse_df.dropna(axis=0)

    data_df = ellipse_df
    X = list(data_df.full_text.to_numpy())
    y = np.array([data_df.drop(['full_text'], axis=1).iloc[i] for i in range(len(X))])

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=SEED)

    model = train_model(X_train, y_train, X_valid, y_valid)

    y_valid_pred = predict(model, X_valid)
    metrics = compute_metrics_for_regression(y_valid, y_valid_pred)
    print(metrics)
    print("final MCRMSE on validation set: ", np.mean(list(metrics.values())))

    submission_df = pd.read_csv('test.csv', header=0, names=['text_id', 'full_text'], index_col='text_id')
    X_submission = list(submission_df.full_text.to_numpy())
    y_submission = predict(model, X_submission)
    # Keep the original text_id index so predictions stay aligned with the test rows.
    submission_df = pd.DataFrame(y_submission, columns=DIMENSIONS, index=submission_df.index)
    submission_df.to_csv('submission.csv')
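
For reference, the MCRMSE the script prints is the mean of the six per-dimension RMSE values. A vectorized equivalent of compute_metrics_for_regression, assuming y_true and y_pred are arrays of shape (n_samples, 6), is sketched below; mcrmse is a hypothetical helper, not part of this commit.

def mcrmse(y_true, y_pred):
    # Hypothetical helper: column-wise RMSE averaged across the six dimensions.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    rmse_per_dim = np.sqrt(((y_true - y_pred) ** 2).mean(axis=0))
    return rmse_per_dim.mean()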