# Spaces: Sleeping (hosting-page status text captured with the source; not part of the program)
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.linear_model import LogisticRegression | |
from transformers import AutoModelForSequenceClassification | |
from transformers import BertTokenizerFast | |
import torch | |
import re | |
import string | |
import pickle | |
import streamlit as st | |
import base64 | |
import plotly.express as px | |
# NOTE(review): `df` is not referenced anywhere in the visible part of this
# script; it looks like a leftover from a Plotly demo snippet. Kept in case code
# outside this view still uses it -- confirm and remove if it is unused.
df = px.data.iris()
def get_img_as_base64(file) -> str:
    """Return the contents of a local file as base64 text.

    Args:
        file: Path of the file to read. It is passed straight to ``open`` in
            binary mode, so it must name a file on disk (not a URL).

    Returns:
        The file's bytes, base64-encoded and decoded to ``str``.
    """
    with open(file, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode()
# Custom CSS injected into the page: background images for the main view and
# the sidebar, a transparent header, a shifted toolbar and a translucent panel.
#
# A local image could be embedded instead of the remote URLs by base64-encoding
# it with get_img_as_base64(); that helper reads local files only, not URLs.
#
# NOTE(review): the ibb.co addresses look like image *page* links rather than
# direct image files; CSS url() needs a direct image URL -- verify they render.
# NOTE(review): `div.css-1n76uvr.e1tzin5v0` appears to be an auto-generated
# Streamlit class name and may stop matching after a Streamlit upgrade.
page_bg_img = """
<style>
[data-testid="stAppViewContainer"] > .main {
background-image: url("https://ibb.co/Df1FvYL");
background-size: 115%;
background-position: top left;
background-repeat: no-repeat;
background-attachment: local;
}
[data-testid="stSidebar"] > div:first-child {
background-image: url("https://ibb.co/ZBkdJRg");
background-size: 115%;
background-position: center;
background-repeat: no-repeat;
background-attachment: fixed;
}
[data-testid="stHeader"] {
background: rgba(0,0,0,0);
}
[data-testid="stToolbar"] {
right: 2rem;
}
div.css-1n76uvr.e1tzin5v0 {
background-color: rgba(238, 238, 238, 0.5);
border: 10px solid #EEEEEE;
padding: 5% 5% 5% 10%;
border-radius: 5px;
}
</style>
"""
# unsafe_allow_html is required for the <style> element to be emitted as HTML;
# the markup above is a fixed literal and contains no user input.
st.markdown(page_bg_img, unsafe_allow_html=True)
# Функция очистки текста | |
def clean(text): | |
text = text.lower() # нижний регистр | |
text = re.sub(r'http\S+', " ", text) # удаляем ссылки | |
text = re.sub(r'@\w+',' ',text) # удаляем упоминания пользователей | |
text = re.sub(r'#\w+', ' ', text) # удаляем хэштеги | |
text = re.sub(r'\d+', ' ', text) # удаляем числа | |
return text | |
# Load the pickled classical-ML classifier.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# these artifacts must only ever come from a trusted training pipeline.
# NOTE(review): presumably a scikit-learn LogisticRegression, judging by the
# imports at the top of the file -- confirm against the training code.
model_filename = 'model_comments_weights.pkl'
with open(model_filename, 'rb') as model_file:
    model = pickle.load(model_file)

# Load the pickled, already fitted vectorizer that turns cleaned text into the
# feature matrix the classifier expects.
vectorizer_filename = 'vectorizer_comments_weights.pkl'
with open(vectorizer_filename, 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
# --- The application itself ---
# Pretrained RuBERT model; tokenizer and model must come from the same
# checkpoint, so the identifier is defined once.
# NOTE(review): Streamlit re-executes this script on every interaction, so the
# checkpoint is reloaded each time; consider wrapping the loading in a cached
# function (e.g. st.cache_resource) if the installed Streamlit provides it.
RUBERT_MODEL_NAME = 'blanchefort/rubert-base-cased-sentiment-rusentiment'
tokenizer_bert = BertTokenizerFast.from_pretrained(RUBERT_MODEL_NAME)
model_bert = AutoModelForSequenceClassification.from_pretrained(RUBERT_MODEL_NAME, return_dict=True)
def _show_verdict(label, color):
    """Render `label` as one paragraph in the CSS colour `color`.

    Both arguments are fixed strings chosen by this script (never user input),
    which is why rendering them as raw HTML is acceptable.
    """
    st.markdown(f"<p style='color: {color};'>{label}</p>", unsafe_allow_html=True)


st.title("SafeTalk")
st.write("Your Personal Comment Filter is an innovative application that harnesses the power of AI to distinguish toxic comments from the rest.")
st.write("Empowering users to navigate online discussions with confidence, SafeTalk ensures a more constructive and respectful online community by identifying and flagging harmful content.")

user_review = st.text_input("Enter your comment:", "")

st.write("Comment by ML model:", user_review)
if user_review.strip():
    # Inference runs only for non-blank input: Streamlit re-executes the script
    # on every interaction, so running both models on an empty string is waste.
    user_review_clean = clean(user_review)

    # Classical model: vectorize the single comment and take its one prediction.
    user_features = vectorizer.transform([user_review_clean])
    prediction = model.predict(user_features)[0]
    if prediction == 0:
        _show_verdict("Non-toxic comment", "green")
    else:
        _show_verdict("Toxic comment", "red")

    st.write("Comment by RuBERT:", user_review)
    inputs = tokenizer_bert(user_review_clean, max_length=512, padding=True, truncation=True, return_tensors='pt')
    # no_grad: pure inference, so no autograd graph needs to be recorded.
    with torch.no_grad():
        outputs = model_bert(**inputs)
    # argmax over the logits selects the same class as argmax over their
    # softmax, so the softmax step is unnecessary for picking a label.
    prediction_bert = int(torch.argmax(outputs.logits, dim=1).item())

    # NOTE(review): class ids are assumed to follow the checkpoint's model card
    # (0 = neutral, 1 = positive, 2 = negative) -- confirm.
    # "Non-toxic" is shown in green to agree with the ML model's verdict above.
    bert_verdicts = {
        0: ("Controversial comment", "green"),
        1: ("Non-toxic comment", "green"),
    }
    label, color = bert_verdicts.get(prediction_bert, ("Toxic comment", "red"))
    _show_verdict(label, color)