abdulmatinomotoso committed on
Commit
b69d9b2
1 Parent(s): 86cc709

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #importing the necessary libraries
2
+
3
+ import re
4
+ import nltk
5
+ from nltk.tokenize import sent_tokenize
6
+ nltk.download('punkt')
7
+ import gradio as gr
8
+ from gradio.mix import Parallel
9
+
10
+ # Defining a function to read in the text file
11
def read_in_text(url):
    """Return the full contents of the text file at ``url``.

    Despite its name, ``url`` is passed straight to ``open`` and is therefore
    a local filesystem path.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding. Bytes that are not valid UTF-8
    are replaced with U+FFFD instead of raising ``UnicodeDecodeError``, so a
    file saved in a legacy encoding can still be read.

    Args:
        url: Path of the text file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(url, "r", encoding="utf-8", errors="replace") as file:
        return file.read()
15
+
16
+ #Doing some text preprocessing, more will still be needed later
17
def clean_text(url):
    """Read the text file at ``url`` and flatten it to one line of ASCII text.

    Steps, in order:
      1. Read the file via ``read_in_text``.
      2. Drop every character that cannot be encoded as ASCII.
      3. Remove byline-like fragments matched by the regex below.
      4. Turn newlines and tabs into spaces, collapse runs of spaces into
         one, and strip leading/trailing whitespace.

    Args:
        url: Path of the text file to clean (forwarded to ``read_in_text``).

    Returns:
        The cleaned text as a single string.
    """
    text = read_in_text(url)

    # Encoding with errors="ignore" silently discards non-ASCII characters.
    text = text.encode("ascii", errors="ignore").decode("ascii")

    # Two byline shapes are removed:
    #   "by <words> - dd/dd/dd h:mm XX XX"   (e.g. "by A B - 01/02/21 10:30 AM ET")
    #   "by <words>dd, dddd"                 (e.g. "by A B | March 12, 2021")
    # Raw strings keep the regex escapes (\s, \d, \/) from being treated as
    # invalid Python string escapes.
    # NOTE(review): "by" is not anchored to a word boundary, so it can also
    # match inside a longer word that is followed by a date — confirm intent.
    text = re.sub(
        r"(by[\s\w,|]+ - \d\d\/\d\d\/\d\d\s\d+:\d+\s\w{2}\s\w{2})"
        r"|(by[\s\w|,]+\d\d,\s\d{4})",
        "",
        text,
    )

    # Newlines and tabs become spaces so the text ends up on a single line.
    text = re.sub(r"[\n\t]", " ", text)

    # Collapse repeated spaces, then trim surrounding whitespace.
    return re.sub(" +", " ", text).strip()
32
+
33
+ #importing the model and tokenizer for the headline generator
34
+ from transformers import (
35
+ AutoTokenizer,
36
+ AutoModelForSeq2SeqLM,
37
+ )
38
+
39
# Checkpoint identifier shared by the tokenizer and the seq2seq model; both
# are loaded once at import time and reused for every request.
model_type_2 = "chinhon/pegasus-newsroom-headline_writer"
tokenizer_2 = AutoTokenizer.from_pretrained(model_type_2)
model_2 = AutoModelForSeq2SeqLM.from_pretrained(model_type_2)
43
+
44
+ #Defining a function to generate the headlines
45
def headline_generator_2(file):
    """Generate a headline for an uploaded text file.

    Args:
        file: Object whose ``name`` attribute is the path of the uploaded
            text file (this function is wired up as the handler of the
            Gradio file input).

    Returns:
        The first decoded sequence produced by ``model_2.generate`` for the
        cleaned text.
    """
    input_text = clean_text(file.name)

    # The article is the model *input*, so it is tokenized directly.
    # ``as_target_tokenizer()`` is intended for encoding labels and is
    # deprecated in Transformers, so it is not used here.
    # Only the first 1000 characters are passed on; ``truncation=True``
    # additionally lets the tokenizer clip to its maximum length.
    batch = tokenizer_2(
        input_text[:1000],
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    translated = model_2.generate(**batch)
    summary_2 = tokenizer_2.batch_decode(translated, skip_special_tokens=True)
    return summary_2[0]
56
+
57
# Gradio UI: a single file-upload input feeding the headline generator, with
# the generated text shown in a textbox.
demo = gr.Interface(
    fn=headline_generator_2,
    inputs=[gr.inputs.File(label="Drop your .txt file here", optional=False)],
    outputs=[gr.outputs.Textbox(label="Summary")],
    title="HEADLINE GENERATOR",
    theme="darkhuggingface",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)