# Importing the necessary libraries
import re

import gradio as gr

# Defining a function to read in the text file
def read_in_text(url):
    with open(url, "r") as file:
        article = file.read()
    return article

# Doing some text preprocessing; more will still be needed later
def clean_text(url):
    text = read_in_text(url)
    text = text.encode("ascii", errors="ignore").decode("ascii")  # remove non-ascii and Chinese characters
    # strip byline/timestamp patterns, e.g. "by ... - 01/02/21 10:30 AM ET" or "by ... June 01, 2021"
    text = re.sub(
        r"(by[\s\w,|]+ - \d\d/\d\d/\d\d\s\d+:\d+\s\w{2}\s\w{2})|(by[\s\w|,]+\d\d,\s\d{4})",
        "",
        text,
    )
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\t", " ", text)
    text = re.sub(" +", " ", text).strip()  # collapse multiple spaces into a single space
    return text

# Importing the model and tokenizer for the headline generator
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Initializing the tokenizer and the model
model_type_2 = "chinhon/pegasus-newsroom-headline_writer"
tokenizer_2 = AutoTokenizer.from_pretrained(model_type_2)
model_2 = AutoModelForSeq2SeqLM.from_pretrained(model_type_2)

# Defining a function to generate the headlines
def headline_generator_2(file):
    input_text = clean_text(file.name)
    batch = tokenizer_2(
        input_text, truncation=True, padding="longest", return_tensors="pt"
    )
    translated = model_2.generate(**batch, max_length=100)
    summary_2 = tokenizer_2.batch_decode(translated, skip_special_tokens=True)
    return summary_2[0]

# Creating an interface for the headline generator using Gradio
demo = gr.Interface(
    headline_generator_2,
    inputs=[gr.inputs.File(label="Drop your .txt file here", optional=False)],
    outputs=[gr.outputs.Textbox(label="Headline")],
    title="HEADLINE GENERATOR",
    theme="darkhuggingface",
)

# Launching the app
if __name__ == "__main__":
    demo.launch(debug=True)
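
# For a quick check without launching the web UI, the generator can be called
# directly on a file-like object (a minimal sketch; "sample_article.txt" is a
# hypothetical path, and SimpleNamespace only mimics the temp-file object that
# Gradio passes to headline_generator_2):
#
#   from types import SimpleNamespace
#   print(headline_generator_2(SimpleNamespace(name="sample_article.txt")))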