# tokenizer-demo / app.py
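"""Streamlit demo for exploring how Hugging Face tokenizers split text into tokens.

Run locally with:  streamlit run app.py
(assumes the packages imported below are installed).
"""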
import pandas as pd
import streamlit as st
import numpy as np
import torch
import io
import time

@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(tokenizer_name):
    # Load (and cache) the Hugging Face tokenizer matching the display name chosen in the sidebar.
    from transformers import AutoTokenizer
    model_name_dict = {
        "BERT": "bert-base-uncased",
        "RoBERTa": "roberta-base",
        "ALBERT": "albert-base-v2",
        "GPT2": "gpt2",
        #"Llama": "meta-llama/Llama-2-7b-chat-hf",
        #"Gemma": "google/gemma-7b",
    }
    tokenizer = AutoTokenizer.from_pretrained(model_name_dict[tokenizer_name])
    return tokenizer
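
# Note: BERT/RoBERTa/ALBERT tokenizers wrap the input in special tokens (e.g. [CLS]/[SEP]
# for BERT), so the token counts shown below include them; the GPT2 tokenizer adds none.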

def generate_markdown(text, color='black', font='Arial', size=20):
    # Wrap text in a centered, styled <p> tag for st.markdown(..., unsafe_allow_html=True).
    return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{text}</p>"

def TokenizeText(sentence, tokenizer_name):
    # Tokenize a sentence with the globally loaded tokenizer and display the token IDs,
    # the corresponding token strings, and the token count. Returns the token count.
    if len(sentence) > 0:
        #if tokenizer_name.startswith('gpt2'):
        #    input_sent = tokenizer(sentence)['input_ids']
        #else:
        #    input_sent = tokenizer(sentence)['input_ids'][1:-1]
        input_sent = tokenizer(sentence)['input_ids']
        encoded_sent = [str(token) for token in input_sent]
        decoded_sent = [tokenizer.decode([token]) for token in input_sent]
        num_tokens = len(decoded_sent)

        #char_nums = [len(word)+2 for word in decoded_sent]
        #word_cols = st.columns(char_nums)
        #for word_col, word in zip(word_cols, decoded_sent):
        #    with word_col:
        #        st.write(word)
        #st.write(' '.join(encoded_sent))
        #st.write(' '.join(decoded_sent))
        st.markdown(generate_markdown(' '.join(encoded_sent), size=16), unsafe_allow_html=True)
        st.markdown(generate_markdown(' '.join(decoded_sent), size=16), unsafe_allow_html=True)
        st.markdown(generate_markdown(f'{num_tokens} tokens'), unsafe_allow_html=True)

        return num_tokens

def DeTokenizeText(input_str):
    # Parse a space-separated string of token IDs, decode it back to text with the
    # globally loaded tokenizer, and display the result. Returns the token count.
    if len(input_str) > 0:
        input_sent = [int(element) for element in input_str.strip().split(' ')]
        encoded_sent = [str(token) for token in input_sent]
        decoded_sent = tokenizer.decode(input_sent)
        num_tokens = len(input_sent)

        #char_nums = [len(word)+2 for word in decoded_sent]
        #word_cols = st.columns(char_nums)
        #for word_col, word in zip(word_cols, decoded_sent):
        #    with word_col:
        #        st.write(word)
        #st.write(' '.join(encoded_sent))
        #st.write(' '.join(decoded_sent))
        st.markdown(generate_markdown(decoded_sent), unsafe_allow_html=True)
        return num_tokens
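
# Streamlit page: injected CSS for layout, a sidebar for picking the tokenizer and optional
# modes, then either a single input box or two side-by-side columns in comparison mode.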
if __name__ == '__main__':
    # Config
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    hide_table_row_index = """
    <style>
        tbody th {display:none}
        .blank {display:none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    # Title
    st.markdown(generate_markdown('WordPiece Explorer', size=32), unsafe_allow_html=True)
    st.markdown(generate_markdown('- a quick and easy way to explore how tokenizers work -', size=24), unsafe_allow_html=True)

    # Select and load the tokenizer
    st.sidebar.write('1. Choose the tokenizer from below')
    tokenizer_name = st.sidebar.selectbox('',
                                          ("BERT", "RoBERTa", "ALBERT", "GPT2"))
    tokenizer = load_model(tokenizer_name)

    st.sidebar.write('2. Optional settings')
    comparison_mode = st.sidebar.checkbox('Compare two texts')
    detokenize = st.sidebar.checkbox('de-tokenize')
    st.sidebar.write('"Compare two texts" compares the number of tokens in two pieces of text, '
                     'and "de-tokenize" converts a list of token IDs back into a string.')
    st.sidebar.write('For "de-tokenize", make sure to type in integers separated by single spaces.')

    if comparison_mode:
        sent_cols = st.columns(2)
        num_tokens = {}
        sents = {}
        for sent_id, sent_col in enumerate(sent_cols):
            with sent_col:
                if detokenize:
                    sentence = st.text_input(f'Tokenized IDs {sent_id+1}')
                    num_tokens[f'sent_{sent_id+1}'] = DeTokenizeText(sentence)
                else:
                    sentence = st.text_input(f'Text {sent_id+1}')
                    num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence, tokenizer_name)
                sents[f'sent_{sent_id+1}'] = sentence

        if len(sents['sent_1']) > 0 and len(sents['sent_2']) > 0:
            st.markdown(generate_markdown('# Tokens&colon; ', size=16), unsafe_allow_html=True)
            if num_tokens['sent_1'] == num_tokens['sent_2']:
                st.markdown(generate_markdown('Matched! ', color='MediumAquamarine'), unsafe_allow_html=True)
            else:
                st.markdown(generate_markdown('Not Matched... ', color='Salmon'), unsafe_allow_html=True)
    else:
        # Single-text mode: pre-fill a default sentence (or its token IDs) so an example renders on load.
        if detokenize:
            #if tokenizer_name.startswith('gpt2'):
            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
            #else:
            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
            default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
            sentence = st.text_input('Tokenized IDs', value=' '.join([str(token) for token in default_tokens]))
            num_tokens = DeTokenizeText(sentence)
        else:
            sentence = st.text_input('Text', value='Tokenizers decompose bigger words into smaller tokens')
            num_tokens = TokenizeText(sentence, tokenizer_name)