#!/usr/bin/env python3
import argparse, hashlib, os, re, sys, time, zipfile, csv

# Hash-type tokens accepted on the command line; see compute_hash() for the
# algorithm each one maps to.
ALLOWED_HASH_TYPES = ['HEMMD5', 'HEMSHA1', 'HEMSHA2']

# Fixed schema of the output CSV. HEM/HEMType are generated; every other
# column is passed through from the input when a matching header is found.
OUTPUT_COLUMNS = [
    'Customer ID', 'Address Line 1', 'Address Line 2', 'City', 'State',
    'ZIP Code', 'ZIP11', 'MAID', 'HEM', 'HEMType', 'IP Address'
]

# Header spellings (compared after normalize()) recognized as the email column.
EMAIL_CANDIDATES = {
    'email', 'e-mail', 'emailaddress', 'email address', 'e mail', 'e_mail'
}

# Deliberately loose shape check: something@something.something.
# Not an RFC 5322 validator — it only filters out obvious non-emails.
EMAIL_REGEX = re.compile(r'^(?:.+)@(?:.+?)\.(.+)$')

def is_valid_email_address(s: str) -> bool:
    """Return True when *s* loosely looks like an email (user@domain.tld)."""
    return bool(s) and EMAIL_REGEX.match(s.strip()) is not None

def compute_hash(t: str, email: str) -> str:
    """Return the hex digest of *email* for hash type *t*.

    Unknown hash types yield an empty string rather than raising.
    """
    algorithms = {
        'HEMMD5': hashlib.md5,
        'HEMSHA1': hashlib.sha1,
        'HEMSHA2': hashlib.sha256,
    }
    algorithm = algorithms.get(t)
    if algorithm is None:
        return ""
    return algorithm(email.encode('utf-8')).hexdigest()

def normalize(name: str) -> str:
    """Canonical header key: lowercase alphanumerics only.

    Tolerates a leading BOM, non-breaking spaces, and surrounding
    whitespace; returns '' for None.
    """
    if name is None:
        return ''
    cleaned = name.replace('\ufeff', '').replace('\u00A0', ' ').strip().lower()
    return ''.join(filter(str.isalnum, cleaned))

# Maps each output column to the set of alternative input-header spellings
# (compared after normalize()) that should populate it. Exact-name matches
# are tried first in main(); these aliases are the fallback.
INPUT_ALIASES = {
    'Customer ID': {'customer id', 'customer_id', 'cust id', 'custid', 'id'},
    'Address Line 1': {'address line 1', 'address1', 'address_line_1', 'addr1', 'street', 'street1'},
    'Address Line 2': {'address line 2', 'address2', 'address_line_2', 'addr2', 'street2'},
    'City': {'city', 'town'},
    'State': {'state', 'region', 'province', 'st'},
    'ZIP Code': {'zip code', 'zip', 'zipcode', 'postal code', 'postalcode'},
    'ZIP11': {'zip11', 'zip+4', 'zip_11', 'zipplus4'},
    'MAID': {'maid', 'mobile ad id', 'mobileadid', 'advertising id', 'adid', 'idfa', 'gaid'},
    'IP Address': {'ip address', 'ip', 'ip_address'},
}

def parse_args(argv):
    """Parse CLI arguments: input file, hash type, --compress, --silent."""
    parser = argparse.ArgumentParser(
        description="Read a CSV/TSV and hash the email addresses.")
    parser.add_argument(
        "file", metavar="csv-file",
        help="input csv/tsv file with email addresses")
    parser.add_argument(
        "type", metavar="hash-type",
        help="HEMMD5 | HEMSHA1 | HEMSHA2")
    parser.add_argument(
        "--compress", action="store_true",
        help="compress the output into a ZIP archive")
    parser.add_argument(
        "--silent", action="store_true",
        help="run in silent mode (no prompts/summary)")
    return parser.parse_args(argv)

def detect_dialect(path):
    """Determine the CSV dialect (delimiter/quotechar) for *path*.

    Preference order:
      1. A ``.tsv`` extension forces a tab delimiter (file is not read).
      2. ``csv.Sniffer`` on the first 4 KiB of the file.
      3. Heuristic fallback: tab if tabs dominate the sample, else comma.

    Returns an object exposing ``delimiter`` and ``quotechar`` attributes.
    Propagates whatever ``open`` raises (e.g. FileNotFoundError); callers
    handle those.
    """
    def make_dialect(delimiter):
        # Minimal stand-in exposing the two attributes the caller reads.
        class D:
            pass
        D.delimiter = delimiter
        D.quotechar = '"'
        return D()

    # Prefer tab if .tsv
    if path.lower().endswith(".tsv"):
        return make_dialect('\t')

    with open(path, "r", encoding="utf-8-sig", newline='') as f:
        sample = f.read(4096)

    sniffer = csv.Sniffer()
    try:
        dialect = sniffer.sniff(sample, delimiters=[',', '\t', ';', '|'])
    except csv.Error:
        # Sniffing failed outright: pick tab if the sample looks tabby.
        delim = '\t' if '\t' in sample and sample.count('\t') >= sample.count(',') else ','
        return make_dialect(delim)
    # BUG FIX: the previous version forced a comma delimiter whenever
    # Sniffer.has_header() returned False, discarding a correctly sniffed
    # delimiter (e.g. ';', '|', or '\t'). Header presence has no bearing on
    # how fields are split, so keep the sniffed dialect.
    return dialect

def main() -> None:
    """CLI entry point: validate args, hash the email column, write output.

    Reads the input CSV/TSV, locates the email column by name, writes a new
    CSV following the fixed OUTPUT_COLUMNS schema (hashed email in HEM,
    algorithm label in HEMType), and optionally zips the result. All fatal
    conditions terminate via sys.exit().
    """
    args = parse_args(sys.argv[1:])
    hash_type = args.type.upper()
    if hash_type not in ALLOWED_HASH_TYPES:
        sys.exit("Error: use HEMMD5, HEMSHA1, or HEMSHA2.")

    in_file = args.file
    # Output names derive from the input basename; the CSV keeps the input
    # extension (defaulting to .csv when there is none).
    base, ext = os.path.splitext(os.path.basename(in_file))
    out_csv = f"{base}_hashed_{hash_type.lower()}{(ext if ext else '.csv')}"
    out_zip = f"{base}_hashed_{hash_type.lower()}.zip"
    output_file = out_zip if args.compress else out_csv

    # NOTE(review): only the final artifact is checked for overwrite; with
    # --compress the intermediate CSV can clobber an existing file without
    # prompting — confirm this is acceptable.
    if not args.silent and os.path.isfile(output_file):
        ans = input(f"The output file {output_file} exists and will be overwritten\n Proceed? (yes/no): ")
        if ans.lower() not in ("yes","y"): sys.exit()

    if not args.silent:
        print("Please wait, hashing email addresses…")

    start = time.time(); processed = 0

    try:
        dialect = detect_dialect(in_file)
        # utf-8-sig on input strips a leading BOM if present; newline='' is
        # required by the csv module on both handles.
        with open(in_file, "r", newline='', encoding="utf-8-sig") as f_in, \
                open(out_csv, "w", newline='', encoding="utf-8") as f_out:

            reader = csv.DictReader(f_in, delimiter=dialect.delimiter, quotechar=getattr(dialect,'quotechar','"'))
            if not reader.fieldnames:
                sys.exit(f"Error: No headers found in {os.path.abspath(in_file)}")

            # Clean incoming headers and map normalized -> original
            cleaned_headers = [(h.replace('\ufeff','').replace('\u00A0',' ').strip()) for h in reader.fieldnames]
            reader.fieldnames = cleaned_headers  # ensure DictReader uses cleaned keys
            in_headers_norm = {normalize(h): h for h in cleaned_headers}

            # Find email column
            email_key = None
            for cand in EMAIL_CANDIDATES:
                k = normalize(cand)
                if k in in_headers_norm:
                    email_key = in_headers_norm[k]; break
            if not email_key:
                # fall back to anything that normalizes exactly to 'email'
                for h in cleaned_headers:
                    if normalize(h) == 'email':
                        email_key = h; break
            if not email_key:
                sys.exit("Error: Unable to find an email column (e.g., 'Email' or 'Email Address').")

            # Build passthrough mapping for output columns (except HEM/HEMType)
            # Exact (normalized) name match wins; INPUT_ALIASES is the fallback.
            passthrough = {}
            for out_col in OUTPUT_COLUMNS:
                if out_col in ('HEM','HEMType'): continue
                n = normalize(out_col)
                if n in in_headers_norm:
                    passthrough[out_col] = in_headers_norm[n]; continue
                for alias in INPUT_ALIASES.get(out_col, set()):
                    na = normalize(alias)
                    if na in in_headers_norm:
                        passthrough[out_col] = in_headers_norm[na]; break

            writer = csv.DictWriter(f_out, fieldnames=OUTPUT_COLUMNS, lineterminator='\n')
            writer.writeheader()

            for row in reader:
                # Make sure row keys match cleaned headers
                row = { (k.replace('\ufeff','').replace('\u00A0',' ').strip() if isinstance(k,str) else k): v
                        for k,v in row.items() }

                # Email is lowercased before hashing, so digests are
                # case-insensitive on the address.
                email_value = (row.get(email_key) or "").strip().lower()
                # Rows without a plausible email address are silently skipped.
                if not is_valid_email_address(email_value):
                    continue

                out_row = {c: "" for c in OUTPUT_COLUMNS}
                for out_col, in_col in passthrough.items():
                    out_row[out_col] = (row.get(in_col) or "").strip()

                out_row['HEM'] = compute_hash(hash_type, email_value)
                out_row['HEMType'] = hash_type

                writer.writerow(out_row)
                processed += 1

    except FileNotFoundError:
        sys.exit(f"File {in_file} not found. Please check the file path.")
    except IOError as e:
        sys.exit(f"I/O error: {e}")

    if args.compress:
        # Bundle the CSV into a ZIP. NOTE(review): the uncompressed CSV is
        # left on disk next to the archive — confirm that is intended.
        with zipfile.ZipFile(out_zip, 'w', zipfile.ZIP_DEFLATED) as zf:
            zf.write(out_csv)

    if not args.silent:
        print(f"Hashed {processed} email addresses in {time.time()-start:0.2f}s using {hash_type} to {output_file}")

if __name__ == "__main__":
    main()
