Here is my Python script for processing about 1M files, mainly to check whether these files exist:
def check_data_valid():
    """Classify every file named in the download list as valid / error / missing.

    For each entry in ``download_list`` (one relative path per line, expected
    to end in ``.mp4``):
      * if the corresponding ``.avi`` file exists  -> valid
      * elif the original ``.mp4`` file exists     -> error (not yet converted)
      * else                                       -> missing
    Results are checkpointed to ``save_root`` every 200k entries and written
    in full at the end.

    NOTE(review): ``os.path.exists`` issues one stat() syscall per file. On a
    network filesystem (the ``/data_alpha`` mount looks like one — confirm), a
    stalled or overloaded server leaves the process blocked in the kernel,
    which is exactly the uninterruptible "D" state seen in ``ps``. Consider
    listing each directory once with ``os.scandir`` and testing membership in
    a set instead of stat()-ing ~2M individual paths.
    """
    data_root = "/data_alpha/data/video/internvid/"
    save_root = "/data_alpha/data/video/internvid/all_file_list/proc"
    download_list = "/data_alpha/data/video/internvid/all_file_list/all_files.txt"
    valid_data = []
    err_data = []
    missing_data = []
    # Iterate the file lazily instead of readlines(): the list is ~1M lines.
    with open(download_list, "r") as f:
        downloads = [line.strip() for line in f]
    for i, item in enumerate(downloads):
        file = os.path.join(data_root, item)
        avi = file.replace(".mp4", ".avi")
        if os.path.exists(avi):
            valid_data.append(item)
        elif os.path.exists(file):
            err_data.append(item)
        else:
            missing_data.append(item)
        # Periodic checkpoint so partial progress survives a crash or hang.
        if i % 200000 == 1:
            if valid_data:
                with open(os.path.join(save_root, f"valid_{len(valid_data)}_{i}.txt"), "w") as out:
                    out.write("\n".join(valid_data))
            if err_data:
                # BUG FIX: the original used the undefined name ``start``
                # (``i + start``), raising NameError at the first checkpoint
                # with a non-empty err_data list. Use ``i`` like the others.
                with open(os.path.join(save_root, f"error_{len(err_data)}_{i}.txt"), "w") as out:
                    out.write("\n".join(err_data))
            if missing_data:
                with open(os.path.join(save_root, f"missing_{len(missing_data)}_{i}.txt"), "w") as out:
                    out.write("\n".join(missing_data))
    # Final, complete result files.
    if valid_data:
        with open(os.path.join(save_root, "valid_total.txt"), "w") as out:
            out.write("\n".join(valid_data))
    if err_data:
        with open(os.path.join(save_root, "error_total.txt"), "w") as out:
            out.write("\n".join(err_data))
    if missing_data:
        with open(os.path.join(save_root, "missing_total.txt"), "w") as out:
            out.write("\n".join(missing_data))
# Script entry point: run the existence check only when executed directly.
if __name__ == "__main__":
    check_data_valid()
However, after running for several hours and processing about 100k files (judging by the output files in save_root), the process enters status D (as shown by the ps command), which means uninterruptible sleep. I wonder why this happens and how to avoid it.