#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Universal Data Cleaner v2 – focused on accuracy and memory efficiency.
Author : fallenstarzz (modified by ChatGPT)
Licence : MIT
"""

import os
import re
from datetime import datetime, timezone

# ──────────────────────────────────────────────────────────────
# 1. REGEX – tightened, anchored patterns
# ──────────────────────────────────────────────────────────────
PATTERNS = [
    # https://domain.com/path:username:password
    re.compile(r'^https?://[^/]+/.+?:([^:\s]+):(.+)$', re.I),
    # http://domain.com:username:password
    re.compile(r'^https?://[^:]+:([^:\s]+):(.+)$', re.I),
    # domain.com:username:password
    re.compile(r'^[a-z0-9.-]+\.[a-z]{2,}:([^:\s]+):(.+)$', re.I),
    # anything containing login/signup/auth/register:username:password
    re.compile(r'^[^:]*(?:login|signup|auth|register)[^:]*:([^:\s]+):(.+)$', re.I),
    # generic three-column foo:username:password
    re.compile(r'^[^:]+:([^:\s]+):(.+)$'),
]

# ──────────────────────────────────────────────────────────────
# 2. UTILITIES
# ──────────────────────────────────────────────────────────────
def is_comment(line: str) -> bool:
    """Return True for blank lines and comment-only (#) lines – leading whitespace allowed."""
    return not line.strip() or line.lstrip().startswith("#")


def parse_line(line: str):
    """Try each pattern against the line; return 'username:password' or None."""
    for pat in PATTERNS:
        m = pat.match(line)
        if m:
            user, pwd = m.group(1).strip(), m.group(2).strip()
            # Both values must be non-empty and free of whitespace / control characters
            if user and pwd and not re.search(r'[\s\x00-\x1f]', user + pwd):
                return f"{user}:{pwd}"
    return None

# ──────────────────────────────────────────────────────────────
# 3. MAIN FUNCTION – streaming & deduplication
# ──────────────────────────────────────────────────────────────
def universal_data_cleaner():
    banner = "=" * 54
    print(f"{banner}\n UNIVERSAL DATA CLEANER v2\n{banner}")
    print("User : fallenstarzz")
    print(f"Date : {datetime.now(timezone.utc):%Y-%m-%d %H:%M:%S} UTC\n{banner}\n")

    # ── Input ░░
    src = input("📁 Input file name: ").strip()
    if not os.path.isfile(src):
        print(f"❌ File '{src}' not found.")
        return
    dst = input("💾 Output file name (Enter = auto): ").strip()
    if not dst:
        base = os.path.splitext(src)[0]
        dst = f"{base}_cleaned.txt"

    print(f"\n🔄 Processing '{src}' → '{dst}' …\n")
    total, ok, dupes = 0, 0, 0
    skipped_preview, preview_out = [], []
    seen = set()          # deduplication

    try:
        with open(src, encoding="utf-8", errors="replace") as fin, \
             open(dst, "w", encoding="utf-8") as fout:
            for line in fin:
                total += 1
                if is_comment(line):
                    continue
                result = parse_line(line.rstrip("\r\n"))
                if result:
                    if result not in seen:
                        seen.add(result)
                        fout.write(result + "\n")
                        ok += 1
                        if ok <= 10:
                            preview_out.append(result)
                    else:
                        dupes += 1
                else:
                    if len(skipped_preview) < 5:
                        skipped_preview.append(line.rstrip()[:60])
                # Progress report every 10,000 lines
                if total % 10_000 == 0:
                    print(f"   … {total:,} lines → {ok:,} valid")
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        return

    # ── Summary ░░
    print("\n" + banner)
    print("📊 SUMMARY")
    print(banner)
    print(f"Total lines read    : {total:,}")
    print(f"Valid entries saved : {ok:,}")
    print(f"Duplicates removed  : {dupes:,}")
    print(f"Lines skipped       : {total - ok - dupes:,}")

    if preview_out:
        print("\n👀 Sample output:")
        for i, v in enumerate(preview_out, 1):
            print(f"{i:2}. {v}")
        if ok > 10:
            print(f"    … and {ok - 10} more")

    if skipped_preview:
        print("\n⚠️  Sample skipped lines:")
        for s in skipped_preview:
            print("   ", s + ("…" if len(s) == 60 else ""))

    print("\n✅ Done!\n")


if __name__ == "__main__":
    universal_data_cleaner()
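# ──────────────────────────────────────────────────────────────
# Usage sketch (illustrative only; the file name "combo.txt" is hypothetical).
# Given an input file containing:
#
#     https://example.com/login:alice:s3cret
#     example.com:bob:hunter2
#     # just a comment
#     https://example.com/login:alice:s3cret
#
# answering the prompts with "combo.txt" and accepting the auto-generated
# output name produces "combo_cleaned.txt" containing:
#
#     alice:s3cret
#     bob:hunter2
#
# and the summary reports 4 lines read, 2 valid entries saved,
# 1 duplicate removed, and 1 line skipped (the comment line).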