chinhon committed on
Commit
e072175
1 Parent(s): 6b309c0

new py file for headlines comparison

Browse files
Files changed (1) hide show
  1. app.py +89 -0
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+
4
+ from gradio.mix import Parallel
5
+ from transformers import (
6
+ AutoTokenizer,
7
+ AutoModelForSeq2SeqLM,
8
+ )
9
+
10
def clean_text(text):
    """Normalise pasted article text before it is handed to a tokenizer.

    Characters outside the ASCII range (e.g. Chinese characters) are dropped,
    every run of whitespace is collapsed into a single space, and leading and
    trailing whitespace is removed.

    Args:
        text: Raw text as a ``str``.

    Returns:
        The cleaned, single-line ASCII string. Empty when nothing but
        whitespace or non-ASCII characters was supplied.
    """
    # Encoding with errors="ignore" silently discards anything non-ASCII.
    ascii_only = text.encode("ascii", errors="ignore").decode("ascii")
    # One pass over all whitespace also covers "\r" from CRLF line endings,
    # which separate handling of "\n", "\t" and " " would leave in the text.
    return re.sub(r"\s+", " ", ascii_only).strip()
22
+
23
+
24
modchoice_1 = "chinhon/pegasus-newsroom-malay_headlines"

# Filled on first use so the checkpoint is loaded once per process instead of
# once per request.
_loaded_1 = {}


def _load_model_1():
    """Return the (tokenizer, model) pair for ``modchoice_1``, loading it on first use."""
    if "pair" not in _loaded_1:
        tokenizer = AutoTokenizer.from_pretrained(modchoice_1)
        model = AutoModelForSeq2SeqLM.from_pretrained(modchoice_1)
        # Stored in a single assignment so a concurrent caller never observes
        # a half-initialised cache entry.
        _loaded_1["pair"] = (tokenizer, model)
    return _loaded_1["pair"]


def headline_writer1(text):
    """Generate a headline for ``text`` with the ``modchoice_1`` model.

    Args:
        text: Raw article text; it is passed through ``clean_text`` first.

    Returns:
        The first decoded sequence produced by the model, or an empty string
        when the cleaned input is empty.
    """
    input_text = clean_text(text)
    if not input_text:
        # Nothing to summarise; skip loading and running the model.
        return ""

    tokenizer_1, model_1 = _load_model_1()

    # The article is the *source* sequence, so it is encoded with the regular
    # tokenizer call rather than inside as_target_tokenizer(), which is meant
    # for encoding labels.
    batch = tokenizer_1(
        input_text, truncation=True, padding="longest", return_tensors="pt"
    )

    generated = model_1.generate(**batch)
    summary_1 = tokenizer_1.batch_decode(generated, skip_special_tokens=True)

    return summary_1[0]
43
+
44
+
45
# Interface around the first headline model: one free-text input, one
# unlabeled text output.
_article_box_1 = gr.inputs.Textbox()
_headline_box_1 = gr.outputs.Textbox(label="")
headline1 = gr.Interface(
    fn=headline_writer1,
    inputs=_article_box_1,
    outputs=_headline_box_1,
)
50
+
51
+
52
modchoice_2 = "chinhon/pegasus-multi_news-malay_headlines_02"

# Filled on first use so the checkpoint is loaded once per process instead of
# once per request.
_loaded_2 = {}


def _load_model_2():
    """Return the (tokenizer, model) pair for ``modchoice_2``, loading it on first use."""
    if "pair" not in _loaded_2:
        tokenizer = AutoTokenizer.from_pretrained(modchoice_2)
        model = AutoModelForSeq2SeqLM.from_pretrained(modchoice_2)
        # Stored in a single assignment so a concurrent caller never observes
        # a half-initialised cache entry.
        _loaded_2["pair"] = (tokenizer, model)
    return _loaded_2["pair"]


def headline_writer2(text):
    """Generate a headline for ``text`` with the ``modchoice_2`` model.

    Args:
        text: Raw article text; it is passed through ``clean_text`` first.

    Returns:
        The first decoded sequence produced by the model, or an empty string
        when the cleaned input is empty.
    """
    input_text = clean_text(text)
    if not input_text:
        # Nothing to summarise; skip loading and running the model.
        return ""

    tokenizer_2, model_2 = _load_model_2()

    # The article is the *source* sequence, so it is encoded with the regular
    # tokenizer call rather than inside as_target_tokenizer(), which is meant
    # for encoding labels.
    batch = tokenizer_2(
        input_text, truncation=True, padding="longest", return_tensors="pt"
    )

    generated = model_2.generate(**batch)
    summary_2 = tokenizer_2.batch_decode(generated, skip_special_tokens=True)

    return summary_2[0]
71
+
72
+
73
# Interface around the second headline model: one free-text input, one
# unlabeled text output.
_article_box_2 = gr.inputs.Textbox()
_headline_box_2 = gr.outputs.Textbox(label="")
headline2 = gr.Interface(
    fn=headline_writer2,
    inputs=_article_box_2,
    outputs=_headline_box_2,
)
78
+
79
+
80
# Both interfaces are combined with Parallel behind one shared input box,
# and the combined app is launched immediately.
_shared_article_box = gr.inputs.Textbox(
    lines=20,
    label="Paste the first few paragraphs of a Malay language news story here, and choose from 2 suggested headlines",
)
_comparison_app = Parallel(
    headline1,
    headline2,
    title="Malay News Headlines Generator",
    inputs=_shared_article_box,
    theme="darkdefault",
)
_comparison_app.launch()