metadata

language: tr
Dataset: interpress_news_category_tr

INTERPRESS NEWS CLASSIFICATION

Dataset

The dataset downloaded from interpress. This dataset is real world data. Actually there are 273K data but I filtered them and used 108K data for this model. For more information about dataset please visit this link

Model

Model accuracy on train data and validation data is %97.

Usage

pip install transformers or pip install transfomers==4.3.3

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("serdarakyol/interpress-turkish-news-classification")

model = AutoModelForSequenceClassification.from_pretrained("serdarakyol/interpress-turkish-news-classification")

# PREPROCESSING
import re
my_punc = r"#$%&()*+-/:;<=>@[\]^_{|}~"

def clean_url(content):
    reg_url=r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?'
    pattern_url = re.compile(reg_url)
    content = pattern_url.sub('',content)
    return content
    
def clean_email(content):
    reg_email='\S*@\S*\s?'
    pattern_email = re.compile(reg_email)
    content = pattern_email.sub('',content)
    return content
    
def clean_punctuation(content):
    content = content.translate(content.maketrans("", "", my_punc))
    return content

def clean_data(text):
    text = clean_url(text)
    text = clean_email(text)
    text = clean_punctuation(text)

    filtered_sentence = []
    for word in text.split(" "):
        if len(word) > 2:
            filtered_sentence.append(word)
    
    text = ' '.join(filtered_sentence)
    return text

import torch
import numpy as np

if torch.cuda.is_available():    
    device = torch.device("cuda")
    model = model.cuda()
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU name is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

def prediction(news):
    news=clean_data(news)
    news=[news]
    indices=tokenizer.batch_encode_plus(
    news,
    max_length=512,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    return_tensors='pt') # for tf tensors, switch pt to tf

    inputs = indices["input_ids"].clone().detach().to(device)
    masks = indices["attention_mask"].clone().detach().to(device)

    with torch.no_grad():
        output = model(inputs, token_type_ids=None,attention_mask=masks)

    logits = output[0]
    logits = logits.detach().cpu().numpy()
    pred = np.argmax(logits,axis=1)[0]
    return pred

labels = {
    0 : "Culture-Art",
    1 : "Economy",
    2 : "Politics",
    3 : "Education",
    4 : "World",
    5 : "Sport",
    6 : "Technology",
    7 : "Magazine",
    8 : "Health",
    9 : "Agenda"
}
pred = prediction(news)
print(labels[pred])

Thanks to @yavuzkomecoglu for contributes

If you have any question, please, don't hesitate to contact with me