chinhon committed on
Commit
64ae0cc
1 Parent(s): 0e8c8fe

py file for Malay headlines writer app

Browse files
Files changed (1) hide show
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+
4
+ from transformers import (
5
+ AutoTokenizer,
6
+ AutoModelForSeq2SeqLM,
7
+ )
8
+
9
def clean_text(text):
    """Normalise raw article text into a single clean line.

    Steps, in order:
      1. Drop every non-ASCII character (this also removes Chinese
         characters).
      2. Remove URLs, i.e. any run of non-whitespace starting with "http".
      3. Collapse each run of whitespace (spaces, tabs, newlines, carriage
         returns) into one space and trim both ends.

    Args:
        text: The raw text. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned, single-line string.
    """
    if not text:
        return ""

    # Encoding with errors="ignore" silently discards anything outside ASCII.
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = re.sub(r"http\S+", "", text)

    # One pass over all whitespace replaces the separate newline / tab /
    # multi-space substitutions and also covers "\r" from Windows line
    # endings, which would otherwise survive inside the text.
    return re.sub(r"\s+", " ", text).strip()
22
+
23
+
24
# Hugging Face Hub checkpoint used to generate the headlines.
model_name = "chinhon/pegasus-newsroom-malay_headlines"

# Tokenizer/model pairs already loaded in this process, keyed by checkpoint
# name, so that each request does not reload the weights.
_LOADED_MODELS = {}


def _load_tokenizer_and_model(name):
    """Return the ``(tokenizer, model)`` pair for *name*, loading it at most once."""
    if name not in _LOADED_MODELS:
        _LOADED_MODELS[name] = (
            AutoTokenizer.from_pretrained(name),
            AutoModelForSeq2SeqLM.from_pretrained(name),
        )
    return _LOADED_MODELS[name]


def headline_writer(text):
    """Generate a headline for the given news text.

    The text is cleaned with ``clean_text``, tokenised (truncated to at most
    1024 tokens), passed to the seq2seq model's ``generate`` with its default
    generation settings, and the first decoded sequence is returned.

    Args:
        text: Raw article text as entered by the user.

    Returns:
        The generated headline as a string.
    """
    input_text = clean_text(text)

    # Loaded lazily on the first call and reused afterwards; previously both
    # objects were re-created on every single request.
    tokenizer, model = _load_tokenizer_and_model(model_name)

    # The article is the model *input*, so it is encoded with the regular
    # tokenizer call; ``as_target_tokenizer()`` is meant for encoding labels.
    batch = tokenizer(
        input_text,
        truncation=True,
        max_length=1024,
        padding="longest",
        return_tensors="pt",
    )

    raw_write = model.generate(**batch)

    # NOTE(review): ``min_length=100`` and ``length_penalty=100.1`` used to be
    # passed to ``batch_decode``. They are generation arguments and appear to
    # have had no effect there, so they are dropped rather than moved to
    # ``generate``, which would change the headlines produced. Confirm the
    # intended generation settings.
    headline = tokenizer.batch_decode(raw_write, skip_special_tokens=True)

    return headline[0]
49
+
50
+
51
# Input component: a 20-line text box for the article text.
_story_input = gr.inputs.Textbox(
    lines=20, label="Paste the first few paras of a Malay language news story here"
)

# Output component: a text box labelled as the suggested headline.
_headline_output = gr.outputs.Textbox(label="Suggested Headline")

# Wire the headline generator to the two components above.
gradio_ui = gr.Interface(
    fn=headline_writer,
    inputs=_story_input,
    outputs=_headline_output,
    title="Malay News Headlines Generator",
    description="Too busy or tired to write a headline? Try this instead.",
    theme="darkdefault",
)


# Start the web app as soon as this module is executed.
gradio_ui.launch()