---
language: tr
datasets:
- interpress_news_category_tr
---

# INTERPRESS NEWS CLASSIFICATION

## Dataset

The dataset was downloaded from Interpress and consists of real-world news data. The full dataset contains 273K articles; 108K of them remained after filtering and were used to train this model. For more information about the dataset, please visit this [link](https://huggingface.co/datasets/interpress_news_category_tr).

## Model

The model reaches 97% accuracy on both the training and validation data.

## Usage

```sh
pip install transformers
# or pin a specific version
pip install transformers==4.3.3
```

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("serdarakyol/interpress-turkish-news-classification")
model = AutoModelForSequenceClassification.from_pretrained("serdarakyol/interpress-turkish-news-classification")
```

```python
# PREPROCESSING
import re

my_punc = r"#$%&()*+-/:;<=>@[\]^_{|}~"

def clean_url(content):
    # Remove anything that looks like a URL (a token containing a known TLD)
    reg_url = r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nl|se|no|es|mil)[\S]*\s?'
    pattern_url = re.compile(reg_url)
    content = pattern_url.sub('', content)
    return content

def clean_email(content):
    # Remove e-mail addresses
    reg_email = r'\S*@\S*\s?'
    pattern_email = re.compile(reg_email)
    content = pattern_email.sub('', content)
    return content

def clean_punctuation(content):
    # Strip the punctuation characters listed in my_punc
    content = content.translate(str.maketrans("", "", my_punc))
    return content

def clean_data(text):
    text = clean_url(text)
    text = clean_email(text)
    text = clean_punctuation(text)
    # Drop tokens of two characters or fewer
    filtered_sentence = [word for word in text.split(" ") if len(word) > 2]
    return ' '.join(filtered_sentence)
```

```python
import torch
import numpy as np

if torch.cuda.is_available():
    device = torch.device("cuda")
    model = model.cuda()
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU name is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
```

```python
def prediction(news):
    news = clean_data(news)
    indices = tokenizer.batch_encode_plus(
        [news],
        max_length=512,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        return_tensors='pt')  # for TF tensors, switch 'pt' to 'tf'

    inputs = indices["input_ids"].to(device)
    masks = indices["attention_mask"].to(device)

    with torch.no_grad():
        output = model(inputs, token_type_ids=None, attention_mask=masks)

    logits = output[0].detach().cpu().numpy()
    pred = np.argmax(logits, axis=1)[0]
    return pred
```

```python
labels = {
    0: "Culture-Art",
    1: "Economy",
    2: "Politics",
    3: "Education",
    4: "World",
    5: "Sport",
    6: "Technology",
    7: "Magazine",
    8: "Health",
    9: "Agenda"
}

news = "..."  # replace with the Turkish news text to classify
pred = prediction(news)
print(labels[pred])
```

Thanks to @yavuzkomecoglu for the contributions.

If you have any questions, please don't hesitate to contact me:

[![linkedin](https://img.shields.io/badge/LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/serdarakyol55/) [![Github](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)](https://github.com/serdarakyol)
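As a shorter alternative to the manual prediction loop above, the same checkpoint can in principle be loaded through the generic `pipeline` API of transformers. This is a minimal sketch, not the original workflow: the `truncation`/`max_length` call arguments assume a recent transformers version, and if the model config does not define an `id2label` mapping, the pipeline returns generic labels such as `LABEL_5`, which you can map back through the `labels` dictionary above.

```python
from transformers import pipeline

# Minimal sketch using the generic text-classification pipeline.
# Assumes a recent transformers version; labels may come back as
# LABEL_0 .. LABEL_9 unless the checkpoint defines an id2label mapping.
classifier = pipeline(
    "text-classification",
    model="serdarakyol/interpress-turkish-news-classification",
)

# Reuses clean_data and the news variable defined above
result = classifier(clean_data(news), truncation=True, max_length=512)[0]
print(result["label"], result["score"])
```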