import hashlib
import sys

input_file_path = sys.argv[1]
output_file_path = sys.argv[2]
chunk_size = 100000  # Process 100,000 lines at a time

with open(output_file_path, "w", encoding="utf-8") as output_file:
    with open(input_file_path, "r", encoding="utf-8") as input_file:
        chunk = []
        for i, line in enumerate(input_file):
            chunk.append(line)
            if i % chunk_size == 0 and i > 0:
                # Process the chunk: write each line whose MD5 hash has not
                # been seen yet. NOTE: the seen-hash set is created anew for
                # every chunk, so duplicates are only removed within a chunk,
                # not across the whole file.
                completed_lines_hash = set()
                for line in chunk:
                    hash_value = hashlib.md5(line.rstrip().encode("utf-8")).hexdigest()
                    if hash_value not in completed_lines_hash:
                        output_file.write(line)
                        completed_lines_hash.add(hash_value)
                chunk = []
        # Process the last (partial) chunk the same way
        if chunk:
            completed_lines_hash = set()
            for line in chunk:
                hash_value = hashlib.md5(line.rstrip().encode("utf-8")).hexdigest()
                if hash_value not in completed_lines_hash:
                    output_file.write(line)
                    completed_lines_hash.add(hash_value)
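The script takes the input path as the first argument and the output path as the second. A minimal invocation might look like the following (the script and file names are illustrative, not part of the original):

    python dedupe_lines.py input.txt deduped_output.txt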