---
language: tr
datasets:
- interpress_news_category_tr
---
# INTERPRESS NEWS CLASSIFICATION
## Dataset
The dataset was downloaded from Interpress and consists of real-world news articles. The full collection contains 273K articles; after filtering, 108K were used to train this model. For more information about the dataset, please visit this [link](https://huggingface.co/datasets/interpress_news_category_tr).

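If you want to inspect the data yourself, it can be loaded with the `datasets` library. A minimal sketch, assuming `datasets` is installed and the default `train` split name:

```python
from datasets import load_dataset

# Load the Interpress Turkish news category dataset from the Hub
dataset = load_dataset("interpress_news_category_tr")
print(dataset["train"][0])  # inspect one example
```
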
## Model
Model accuracy on both the training and validation data is 97%.

## Usage
```sh
pip install transformers
# or pin the version used for this model:
pip install transformers==4.3.3
```
```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Serdar/bert-model")
model = AutoModelForSequenceClassification.from_pretrained("Serdar/bert-model")
```
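For quick experiments, the same tokenizer and model can also be wrapped in a `pipeline`. A minimal sketch with an illustrative sample sentence; note that without an `id2label` mapping in the model config, the pipeline reports generic `LABEL_<id>` names rather than the category names listed further below:

```python
from transformers import pipeline

# Hypothetical quick-start: wrap tokenizer and model in a text-classification pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("Dolar ve euro haftaya yükselişle başladı"))  # e.g. [{'label': 'LABEL_1', 'score': ...}]
```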
```python
# PREPROCESSING
import re

my_punc = r"#$%&()*+-/:;<=>@[\]^_{|}~"

def clean_url(content):
    reg_url = r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?'
    pattern_url = re.compile(reg_url)
    content = pattern_url.sub('', content)
    return content

def clean_email(content):
    reg_email = r'\S*@\S*\s?'
    pattern_email = re.compile(reg_email)
    content = pattern_email.sub('', content)
    return content

def clean_punctuation(content):
    content = content.translate(str.maketrans("", "", my_punc))
    return content

def clean_data(text):
    text = clean_url(text)
    text = clean_email(text)
    text = clean_punctuation(text)

    # drop tokens of two characters or fewer
    filtered_sentence = []
    for word in text.split(" "):
        if len(word) > 2:
            filtered_sentence.append(word)

    text = ' '.join(filtered_sentence)
    return text
```
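A quick check of the cleaning pipeline (the sample sentence is illustrative):

```python
raw = "Detaylar için https://example.com adresine bakın ya da info@example.com ile iletişime geçin"
print(clean_data(raw))
# URL and e-mail are stripped, and tokens of two characters or fewer ("ya", "da") are dropped:
# "Detaylar için adresine bakın ile iletişime geçin"
```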
```python
import torch
import numpy as np

# run on GPU if one is available, otherwise fall back to CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    model = model.cuda()
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU name is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
```
```python
def prediction(news):
    news = clean_data(news)
    news = [news]
    indices = tokenizer.batch_encode_plus(
        news,
        max_length=512,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        return_tensors='pt')  # for TF tensors, switch 'pt' to 'tf'

    inputs = indices["input_ids"].clone().detach().to(device)
    masks = indices["attention_mask"].clone().detach().to(device)

    with torch.no_grad():
        output = model(inputs, token_type_ids=None, attention_mask=masks)

    logits = output[0]
    logits = logits.detach().cpu().numpy()
    # the predicted class is the index of the largest logit
    pred = np.argmax(logits, axis=1)[0]
    return pred
```

95
+ ```sh
96
+ labels = {
97
+ 0 : "Culture-Art",
98
+ 1 : "Economy",
99
+ 2 : "Politics",
100
+ 3 : "Education",
101
+ 4 : "World",
102
+ 5 : "Sport",
103
+ 6 : "Technology",
104
+ 7 : "Magazine",
105
+ 8 : "Health",
106
+ 9 : "Agenda"
107
+ }
108
+ pred = prediction(news)
109
+ print(labels[pred])
110
+ ```
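End-to-end, `news` is just the raw article text; the sample below is a hypothetical headline for illustration:

```python
news = "Borsa güne yükselişle başladı, dolar geriledi"  # hypothetical economy headline
print(labels[prediction(news)])  # expected to print "Economy" for a text like this
```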
Thanks to @yavuzkomecoglu for contributing.

If you have any questions, please don't hesitate to contact me:

[![linkedin](https://img.shields.io/badge/LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/serdarakyol55/)
[![Github](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)](https://github.com/serdarakyol)