Alyosha11
/

Phoneme

Model card Files Files and versions Community

Alyosha11 committed on May 10

Commit

4b60ebe

•

1 Parent(s): c4d0a5f

Upload 50k.py with huggingface_hub

Browse files

Files changed (1) hide show

50k.py +83 -0

50k.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import os
+import pandas as pd
+from multiprocessing import Pool
+import time
+from tqdm import tqdm
def process_rows(args):
    """Write every row of a DataFrame to its own comma-separated text file.

    The single tuple argument keeps the function usable with ``Pool.map``,
    which passes exactly one positional argument to the worker.

    Args:
        args: A ``(rows, output_directory)`` tuple. ``rows`` is a pandas
            DataFrame; ``output_directory`` is the path of an existing
            directory that receives the files.

    Each row is written to ``row_<index>.txt`` inside ``output_directory``,
    where ``<index>`` is the row's index label in ``rows``. Two rows with the
    same label therefore map to the same file, and the later one overwrites
    the earlier one.
    """
    rows, output_directory = args
    for index, row in rows.iterrows():
        # Generate the output text file path
        text_filename = f"row_{index}.txt"
        text_file_path = os.path.join(output_directory, text_filename)
        # Write the row to a text file. The encoding is pinned to UTF-8 so
        # that non-ASCII cell values neither raise UnicodeEncodeError nor
        # produce different bytes depending on the platform's default locale.
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(','.join(row.astype(str)))
# Directory containing the CSV files
csv_directory = "extracted_csv_files"
# Number of text files to generate
target_count = 50000
# Directory that receives the extracted text files
output_directory = "extracted_text_files_50k"


def main():
    """Extract up to ``target_count`` CSV rows into one text file per row.

    CSV files in ``csv_directory`` are read in sorted order until
    ``target_count`` rows have been written or the files are exhausted.
    The rows of each CSV are split into chunks and handed to a pool of worker
    processes, each of which runs ``process_rows`` on its chunk.
    """
    # Sort the listing: os.listdir returns entries in arbitrary order, which
    # would make the set of extracted rows differ from run to run.
    csv_files = sorted(
        os.path.join(csv_directory, file)
        for file in os.listdir(csv_directory)
        if file.endswith(".csv")
    )
    # Create a directory to store the extracted text files
    os.makedirs(output_directory, exist_ok=True)

    total_count = 0
    worker_count = os.cpu_count() or 1
    # Start the timer
    start_time = time.time()

    # One pool is reused for all CSV files; the context managers make sure
    # the workers and the progress bar are released even if a file fails.
    with Pool(worker_count) as pool:
        with tqdm(total=target_count, unit='files') as progress_bar:
            for csv_file_path in csv_files:
                remaining = target_count - total_count
                if remaining <= 0:
                    break
                # Read the CSV file using pandas
                df = pd.read_csv(csv_file_path)
                # Take at most as many rows as are still needed
                rows_to_extract = min(remaining, len(df))
                if rows_to_extract == 0:
                    continue
                rows = df.iloc[:rows_to_extract].copy()
                # process_rows names each file after the row's index label.
                # Every CSV is indexed from 0, so without a global offset
                # later CSVs would overwrite the files of earlier ones.
                rows.index = range(total_count, total_count + rows_to_extract)
                # Split into roughly one chunk per worker (ceiling division)
                # so the rows are actually processed in parallel.
                chunk_size = -(-rows_to_extract // worker_count)
                tasks = [
                    (rows.iloc[start:start + chunk_size], output_directory)
                    for start in range(0, rows_to_extract, chunk_size)
                ]
                pool.map(process_rows, tasks)
                total_count += rows_to_extract
                # Update the progress bar
                progress_bar.update(rows_to_extract)

    # Calculate the execution time
    execution_time = time.time() - start_time
    print(f"\nGenerated {total_count} text files.")
    print(f"Execution time: {execution_time:.2f} seconds.")


# Guard the entry point: with the "spawn" start method each worker process
# re-imports this module, and unguarded top-level code would run again there.
if __name__ == "__main__":
    main()