#!/usr/bin/env bash
# Benchmark configuration for phonemizing a directory of text files.

# Directory that is searched for input *.txt files.
input_dir="txt"
# Directory that receives the phonemized output files.
output_dir="sangraha_hi_phonemized"
# Language code. It is not exported, so shells spawned by GNU Parallel do not
# inherit it; process_file sets its own copy.
lang=hi
# Upper bound on the number of input files handed to the workers.
num_files=50000
# Value passed to `parallel -j`. GNU Parallel reads a negative number -N as
# "number of CPU threads minus N", so -1 leaves one thread free.
num_jobs=-1
#######################################
# Phonemize a single text file.
# Arguments:
#   $1 - path of the input text file
#   $2 - path of the output file; missing parent directories are created
# Outputs:
#   "Processed: <input> -> <output>" on stdout on success,
#   "Failed: <input>" on stderr when phonemize fails
# Returns:
#   0 on success, non-zero if the output directory cannot be created or
#   phonemize exits with an error
#######################################
process_file() {
  local input_file="$1"
  local output_file="$2"
  # Set here rather than read from the caller: this function is run in shells
  # spawned by GNU Parallel, which only receive exported names.
  local lang=hi

  # Create the output directory and its parent directories if they don't exist
  mkdir -p -- "$(dirname -- "$output_file")" || return

  # Only report success when phonemize actually succeeded, so the caller
  # (and parallel's exit status) can see failed files.
  if ! phonemize --quiet -l "$lang" "$input_file" -o "$output_file" \
    --strip --language-switch remove-flags --preserve-punctuation; then
    printf 'Failed: %s\n' "$input_file" >&2
    return 1
  fi

  echo "Processed: $input_file -> $output_file"
}
# Export the function so the shells started by GNU Parallel can call it.
export -f process_file

# Start the timer
start_time=$(date +%s)

# Use GNU Parallel with find to process files in parallel.
# {} is the full input path and {/} is its basename, both substituted by
# parallel per job. The earlier "$(basename {})" was expanded by this shell
# before parallel ran, so it always produced the literal "{}" and the whole
# input path ended up inside the output name.
# NOTE: outputs are flat under $output_dir, so inputs that share a basename in
# different subdirectories map to the same output file.
find "$input_dir" -type f -name "*.txt" \
  | head -n "$num_files" \
  | parallel -j "$num_jobs" process_file "{}" "${output_dir}/phn_{/}"
# Exit status of the last pipeline stage, i.e. parallel.
parallel_status=$?

# End the timer
end_time=$(date +%s)

# Calculate the elapsed time
elapsed_time=$((end_time - start_time))

# Convert elapsed time to minutes and seconds
minutes=$((elapsed_time / 60))
seconds=$((elapsed_time % 60))

# Print the benchmark results.
# "Number of files processed" reports the configured limit, not a count of
# the files that were actually found.
echo "Benchmark Results:"
echo "Number of files processed: $num_files"
echo "Number of parallel jobs: $num_jobs"
echo "Elapsed time: $minutes minutes $seconds seconds"

# Surface job failures instead of silently reporting a clean benchmark.
if ((parallel_status != 0)); then
  echo "Warning: parallel exited with status $parallel_status" >&2
fi