---
language: tr
datasets:
- interpress_news_category_tr
---

# INTERPRESS NEWS CLASSIFICATION

## Dataset

The dataset was downloaded from Interpress and consists of real-world news articles. The full dataset contains 273K articles, but I filtered it and used 108K articles for this model. For more information about the dataset, please visit this [link](https://huggingface.co/datasets/interpress_news_category_tr).

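If you want to experiment with the same data, it can be loaded with the `datasets` library. This is a minimal sketch; the exact configuration and split names are documented on the dataset card linked above.

```python
from datasets import load_dataset

# Loads the Interpress Turkish news category dataset from the Hugging Face Hub.
# Check the dataset card for available configurations and splits.
dataset = load_dataset("interpress_news_category_tr")
print(dataset)
```
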
## Model

The model reaches 97% accuracy on both the training and validation data.

## Usage

```sh
pip install transformers
# or pin the version used for this model
pip install transformers==4.3.3
```

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Serdar/bert-model")
model = AutoModelForSequenceClassification.from_pretrained("Serdar/bert-model")
```

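For a quick test without the manual preprocessing and tokenization shown below, the `pipeline` API can also be used. This is only a sketch, not part of the original card; note that the returned label may be a generic `LABEL_<id>` string that you still have to map with the `labels` dictionary given at the end.

```python
from transformers import pipeline

# Text-classification pipeline built from the same checkpoint (sketch).
classifier = pipeline("text-classification", model="Serdar/bert-model")
print(classifier("Ekonomide yeni vergi düzenlemesi açıklandı"))  # made-up example input
```
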
```python
# PREPROCESSING
import re

# punctuation characters to strip from the text
my_punc = r"#$%&()*+-/:;<=>@[\]^_{|}~"


def clean_url(content):
    # remove URLs ending in common top-level domains
    reg_url = r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nl|se|no|es|mil)[\S]*\s?'
    pattern_url = re.compile(reg_url)
    content = pattern_url.sub('', content)
    return content


def clean_email(content):
    # remove e-mail addresses
    reg_email = r'\S*@\S*\s?'
    pattern_email = re.compile(reg_email)
    content = pattern_email.sub('', content)
    return content


def clean_punctuation(content):
    # strip the punctuation characters listed in my_punc
    content = content.translate(str.maketrans("", "", my_punc))
    return content


def clean_data(text):
    text = clean_url(text)
    text = clean_email(text)
    text = clean_punctuation(text)

    # keep only words longer than 2 characters
    filtered_sentence = [word for word in text.split(" ") if len(word) > 2]

    text = ' '.join(filtered_sentence)
    return text
```

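As a quick illustration of what `clean_data` does (the sample sentence below is invented for this card):

```python
# Illustrative only; URLs, e-mail addresses, the listed punctuation
# and words of 2 characters or fewer are removed.
sample = "Detaylı bilgi için www.example.com adresini ziyaret edin ya da haber@example.com adresine yazın"
print(clean_data(sample))
```
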
```python
import torch
import numpy as np

# run on GPU if one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    model = model.cuda()
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU name is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
```

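The original card does not switch the model into evaluation mode; doing so is a common precaution so that dropout is disabled during inference:

```python
# optional: disable dropout for deterministic inference
model.eval()
```
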
```python
def prediction(news):
    news = clean_data(news)
    news = [news]

    # tokenize, pad/truncate to 512 tokens and return PyTorch tensors
    indices = tokenizer.batch_encode_plus(
        news,
        max_length=512,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        return_tensors='pt')  # for TensorFlow tensors, switch 'pt' to 'tf'

    inputs = indices["input_ids"].clone().detach().to(device)
    masks = indices["attention_mask"].clone().detach().to(device)

    with torch.no_grad():
        output = model(inputs, token_type_ids=None, attention_mask=masks)

    # pick the class with the highest logit
    logits = output[0]
    logits = logits.detach().cpu().numpy()
    pred = np.argmax(logits, axis=1)[0]
    return pred
```

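If you also want class probabilities instead of only the predicted index, a softmax over the logits gives them. This is a small sketch (not part of the original card) that reuses the tokenizer, model and device defined above:

```python
import torch.nn.functional as F

def prediction_with_scores(news):
    # same preprocessing and tokenization as prediction() above
    encoded = tokenizer([clean_data(news)], max_length=512, padding='max_length',
                        truncation=True, return_tensors='pt')
    inputs = encoded["input_ids"].to(device)
    masks = encoded["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(inputs, attention_mask=masks)[0]

    # convert logits to probabilities over the 10 categories
    probs = F.softmax(logits, dim=1).squeeze().cpu().numpy()
    return int(probs.argmax()), probs
```

The returned index can be mapped to a category name with the `labels` dictionary below.
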
```python
labels = {
    0: "Culture-Art",
    1: "Economy",
    2: "Politics",
    3: "Education",
    4: "World",
    5: "Sport",
    6: "Technology",
    7: "Magazine",
    8: "Health",
    9: "Agenda"
}

# `news` is the raw article text (a Turkish string)
pred = prediction(news)
print(labels[pred])
```

Thanks to @yavuzkomecoglu for his contributions.

If you have any questions, please don't hesitate to contact me:

[![linkedin](https://img.shields.io/badge/LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/serdarakyol55/)
[![Github](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)](https://github.com/serdarakyol)