import os
import time
from multiprocessing import Pool, cpu_count

import pandas as pd
from tqdm import tqdm
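

# Pool.map passes each task as a single argument, so the worker receives a
# (rows, output_directory) tuple and unpacks it itself.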
def process_rows(args):
    rows, output_directory = args
    for index, row in rows.iterrows():
        # Write one comma-separated text file per row, named after its index.
        text_filename = f"row_{index}.txt"
        text_file_path = os.path.join(output_directory, text_filename)
        with open(text_file_path, 'w') as text_file:
            text_file.write(','.join(row.astype(str)))
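

# Everything below must run only in the parent process: spawn-based platforms
# (Windows, recent macOS) re-import this module in every worker, and an
# unguarded entry point would make each worker re-execute the whole script.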
if __name__ == "__main__":
    csv_directory = "extracted_csv_files"
    target_count = 50000

    csv_files = [os.path.join(csv_directory, file)
                 for file in os.listdir(csv_directory)
                 if file.endswith(".csv")]

    output_directory = "extracted_text_files_50k"
    os.makedirs(output_directory, exist_ok=True)

    total_count = 0
    file_index = 0

    start_time = time.time()
    progress_bar = tqdm(total=target_count, unit='files')
    num_workers = cpu_count()
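
    # Walk the CSV files in order, extracting rows until the 50,000-file
    # target is met or the files run out. The pool is created once and
    # reused; the original spawned (and tore down) a fresh Pool per file.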
    with Pool(num_workers) as pool:
        while total_count < target_count and file_index < len(csv_files):
            csv_file_path = csv_files[file_index]
            df = pd.read_csv(csv_file_path)

            num_rows = len(df)
            rows_to_extract = min(target_count - total_count, num_rows)

            # Shift the index so filenames stay unique across CSV files;
            # with each file's default RangeIndex, later files would
            # overwrite row_0.txt, row_1.txt, ... from earlier ones.
            rows = df.iloc[:rows_to_extract].set_axis(
                range(total_count, total_count + rows_to_extract))

            # Split the slice into one chunk per worker. The original
            # submitted the single task [(rows, output_directory)], which
            # left all the writing to one process.
            chunk_size = max(1, -(-rows_to_extract // num_workers))  # ceiling division
            chunks = [(rows.iloc[i:i + chunk_size], output_directory)
                      for i in range(0, rows_to_extract, chunk_size)]
            pool.map(process_rows, chunks)

            total_count += rows_to_extract
            file_index += 1
            progress_bar.update(rows_to_extract)

    progress_bar.close()

    end_time = time.time()
    execution_time = end_time - start_time

    print(f"\nGenerated {total_count} text files.")
    print(f"Execution time: {execution_time:.2f} seconds.")