Sign up
Login
New paste
Home
Trending
Archive
English
English
Sign up
Login
New Paste
Browse
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Universal Data Cleaner v2 – focused on accuracy and memory efficiency.

Reads a text file line by line, extracts ``username:password`` pairs from
lines shaped like ``<anything>:<username>:<password>``, removes duplicates
and writes the result to an output file.

Author  : fallenstarzz (modified by ChatGPT)
Licence : MIT
"""

import os
import re
from datetime import datetime, timezone
from itertools import islice
from typing import Iterable, List, NamedTuple, Optional, TextIO

# ──────────────────────────────────────────────────────────────
# 1. REGEX – tightened and anchored patterns
# ──────────────────────────────────────────────────────────────
# Tried in order; group 1 is the username, group 2 the password.
PATTERNS = [
    # https://domain.com/path:username:password
    re.compile(r'^https?://[^/]+/.+?:([^:\s]+):(.+)$', re.I),
    # http://domain.com:username:password
    re.compile(r'^https?://[^:]+:([^:\s]+):(.+)$', re.I),
    # domain.com:username:password
    re.compile(r'^[a-z0-9.-]+\.[a-z]{2,}:([^:\s]+):(.+)$', re.I),
    # anythinglogin...:username:password  (login|signup|auth|register)
    re.compile(r'^[^:]*(?:login|signup|auth|register)[^:]*:([^:\s]+):(.+)$', re.I),
    # generic three-column  foo:username:password
    re.compile(r'^[^:]+:([^:\s]+):(.+)$'),
]

# Whitespace or ASCII control characters make a credential invalid.
_INVALID_CHARS = re.compile(r'[\s\0-\x1f]')

# Preview limits used in the final summary.
_MAX_PREVIEW_OUT = 10
_MAX_PREVIEW_SKIPPED = 5
_SKIPPED_PREVIEW_WIDTH = 60
_PROGRESS_EVERY = 10_000


# ──────────────────────────────────────────────────────────────
# 2. UTILITIES
# ──────────────────────────────────────────────────────────────
def is_comment(line: str) -> bool:
    """Return True for blank lines and for lines whose first
    non-whitespace character is ``#`` (leading spaces are allowed)."""
    stripped = line.strip()
    return not stripped or stripped.startswith("#")


def parse_line(line: str) -> Optional[str]:
    """Try every pattern in ``PATTERNS`` against *line*.

    Returns the string ``"username:password"`` built from the first pattern
    whose captured values are both non-empty and free of whitespace/control
    characters, or ``None`` when no pattern yields a valid pair.
    """
    for pat in PATTERNS:
        m = pat.match(line)
        if not m:
            continue
        user, pwd = m.group(1).strip(), m.group(2).strip()
        # A pattern that matches but fails validation does not stop the
        # search: a later pattern may still split the line acceptably.
        if user and pwd and not _INVALID_CHARS.search(user + pwd):
            return f"{user}:{pwd}"
    return None


class _CleanStats(NamedTuple):
    """Counters and preview samples collected by ``_clean_stream``."""

    total: int                  # lines read
    ok: int                     # unique valid entries written
    duplicates: int             # valid entries dropped as repeats
    skipped: int                # comment, blank and unparseable lines
    preview_out: List[str]      # first few written entries
    skipped_preview: List[str]  # first few unparseable lines (truncated)


def _clean_stream(lines: Iterable[str], out: TextIO) -> _CleanStats:
    """Parse *lines*, write each unique ``user:pass`` entry to *out* and
    return the collected statistics. Prints a progress line periodically."""
    total = ok = duplicates = 0
    seen = set()  # entries already written, for de-duplication
    preview_out: List[str] = []
    skipped_preview: List[str] = []

    for line in lines:
        total += 1
        if not is_comment(line):
            result = parse_line(line.rstrip("\n"))
            if result is None:
                if len(skipped_preview) < _MAX_PREVIEW_SKIPPED:
                    # Drop the line terminator so the preview prints on one
                    # line, and mark only lines that were really truncated.
                    text = line.rstrip("\r\n")
                    clipped = text[:_SKIPPED_PREVIEW_WIDTH]
                    if len(text) > _SKIPPED_PREVIEW_WIDTH:
                        clipped += "…"
                    skipped_preview.append(clipped)
            elif result in seen:
                duplicates += 1
            else:
                seen.add(result)
                out.write(result + "\n")
                ok += 1
                if ok <= _MAX_PREVIEW_OUT:
                    preview_out.append(result)

        # Progress every 10,000 lines, including when that line is a comment.
        if total % _PROGRESS_EVERY == 0:
            print(f" … {total:,} baris → {ok:,} valid")

    return _CleanStats(
        total=total,
        ok=ok,
        duplicates=duplicates,
        skipped=total - ok - duplicates,
        preview_out=preview_out,
        skipped_preview=skipped_preview,
    )


# ──────────────────────────────────────────────────────────────
# 3. MAIN FUNCTION – streaming & de-duplication
# ──────────────────────────────────────────────────────────────
def universal_data_cleaner():
    """Interactive entry point.

    Asks for an input file and an optional output file (default:
    ``<input-without-extension>_cleaned.txt``), streams the input through
    the parser, writes unique entries to the output and prints a summary.
    Returns ``None``; problems are reported on stdout.
    """
    banner = "=" * 54
    print(f"{banner}\n UNIVERSAL DATA CLEANER v2\n{banner}")
    print("User : fallenstarzz")
    print(f"Date : {datetime.now(timezone.utc):%Y-%m-%d %H:%M:%S} UTC\n{banner}\n")

    # ── Input ░░
    src = input("📁 Nama file input: ").strip()
    if not os.path.isfile(src):
        print(f"❌ File '{src}' tidak ditemukan.")
        return

    dst = input("💾 Nama file output (enter = auto): ").strip()
    if not dst:
        base = os.path.splitext(src)[0]
        dst = f"{base}_cleaned.txt"

    # Opening the output with "w" truncates it immediately; if it is the
    # input file, the data would be destroyed before it is read.
    if os.path.exists(dst) and os.path.samefile(src, dst):
        print("❌ File output tidak boleh sama dengan file input.")
        return

    print(f"\n🔄 Memproses '{src}' → '{dst}' …\n")

    try:
        with open(src, encoding="utf-8", errors="replace") as fin, \
                open(dst, "w", encoding="utf-8") as fout:
            stats = _clean_stream(fin, fout)
    except OSError as e:
        print(f"\n❌ ERROR: {e}")
        return

    # ── Summary ░░
    print("\n" + banner)
    print("📊 RINGKASAN")
    print(banner)
    print(f"Total baris dibaca : {stats.total:,}")
    print(f"Data valid tersimpan: {stats.ok:,}")
    print(f"Duplikat dihapus : {stats.duplicates:,}")
    print(f"Baris terlewat : {stats.skipped:,}")

    if stats.preview_out:
        print("\n👀 Contoh output:")
        for i, v in enumerate(stats.preview_out, 1):
            print(f"{i:2}. {v}")
        if stats.ok > _MAX_PREVIEW_OUT:
            print(f" … dan {stats.ok - _MAX_PREVIEW_OUT} lainnya")

    if stats.skipped_preview:
        print("\n⚠️ Contoh baris terlewat:")
        for s in stats.skipped_preview:
            print(" ", s)

    print("\n✅ Selesai!\n")


if __name__ == "__main__":
    universal_data_cleaner()
Paste Settings
Paste Title :
[Optional]
Paste Folder :
[Optional]
Select
Syntax Highlighting :
[Optional]
Select
Markup
CSS
JavaScript
Bash
C
C#
C++
Java
JSON
Lua
Plaintext
C-like
ABAP
ActionScript
Ada
Apache Configuration
APL
AppleScript
Arduino
ARFF
AsciiDoc
6502 Assembly
ASP.NET (C#)
AutoHotKey
AutoIt
Basic
Batch
Bison
Brainfuck
Bro
CoffeeScript
Clojure
Crystal
Content-Security-Policy
CSS Extras
D
Dart
Diff
Django/Jinja2
Docker
Eiffel
Elixir
Elm
ERB
Erlang
F#
Flow
Fortran
GEDCOM
Gherkin
Git
GLSL
GameMaker Language
Go
GraphQL
Groovy
Haml
Handlebars
Haskell
Haxe
HTTP
HTTP Public-Key-Pins
HTTP Strict-Transport-Security
IchigoJam
Icon
Inform 7
INI
IO
J
Jolie
Julia
Keyman
Kotlin
LaTeX
Less
Liquid
Lisp
LiveScript
LOLCODE
Makefile
Markdown
Markup templating
MATLAB
MEL
Mizar
Monkey
N4JS
NASM
nginx
Nim
Nix
NSIS
Objective-C
OCaml
OpenCL
Oz
PARI/GP
Parser
Pascal
Perl
PHP
PHP Extras
PL/SQL
PowerShell
Processing
Prolog
.properties
Protocol Buffers
Pug
Puppet
Pure
Python
Q (kdb+ database)
Qore
R
React JSX
React TSX
Ren'py
Reason
reST (reStructuredText)
Rip
Roboconf
Ruby
Rust
SAS
Sass (Sass)
Sass (Scss)
Scala
Scheme
Smalltalk
Smarty
SQL
Soy (Closure Template)
Stylus
Swift
TAP
Tcl
Textile
Template Toolkit 2
Twig
TypeScript
VB.Net
Velocity
Verilog
VHDL
vim
Visual Basic
WebAssembly
Wiki markup
Xeora
Xojo (REALbasic)
XQuery
YAML
HTML
Paste Expiration :
[Optional]
Never
Self Destroy
10 Minutes
1 Hour
1 Day
1 Week
2 Weeks
1 Month
6 Months
1 Year
Paste Status :
[Optional]
Public
Unlisted
Private (members only)
Password :
[Optional]
Description:
[Optional]
Tags:
[Optional]
Encrypt Paste
(
?
)
Create New Paste
You are currently not logged in, this means you can not edit or delete anything you paste.
Sign Up
or
Login
Site Languages
×
English
Do you like cookies?
🍪 We use cookies to ensure you get the best experience on our website.
Learn more
I agree