#!/usr/local/bin/perl -W --
# $Id: ungoopspam,v 1.4 2002/01/31 09:20:40 chip Exp $
# $Source: /home/chip/src/ungoopspam/RCS/ungoopspam,v $
#
# ungoopspam - filter that undoes obfuscations commonly applied to spam HTML.
#
# Reads the files named on the command line (or standard input), applies
# quoted-printable decoding, HTML entity decoding and %nn hex-escape
# decoding in that order, and writes the result to standard output.
# See the POD after __END__ for the user-level documentation.

use strict;
use warnings;

use HTML::Entities;
use MIME::QuotedPrint;

# slurp in entire document; list-context <> concatenates every input file
my $text = join("", <>);

# undo quoted-printable (based on dumb heuristic): a line ending in "=" is
# a QP soft line break.  Accept CRLF as well as LF line endings so that
# messages saved with their original mail line terminators are recognized.
$text = decode_qp($text) . "\n" if $text =~ /=\r?\n/;

# decode &entities;
$text = decode_entities($text);

# translate %nn hex escapes (this should be limited to URIs)
#
# This is done in a single left-to-right pass so that every escape is
# decoded exactly once.  Rescanning the document after each substitution
# would decode the output of an earlier substitution again (for example
# "%2541" -> "%41" -> "A") and make the result depend on the order in
# which escapes happen to be found.
$text =~ s/%([0-9a-f]{2})/chr(hex($1))/gie;

print $text;

__END__

=head1 NAME

ungoopspam - clarify HTML obfuscations popular with spammers

=head1 SYNOPSIS

B<ungoopspam> [file ...]

=head1 DESCRIPTION

The B<ungoopspam> utility reads an HTML document from I<file> (or
standard input if no files are specified), and performs a number of
transformations.

=over 4

=item *

The document is converted from quoted-printable encoding to plain
8-bit text.  This may have happened if the spammer attached the web
page using QP format.  It's actually a function of the mail system,
not HTML.

=item *

Any HTML entities, such as "&quot;" or "&#65;" are converted to
their 8-bit character equivalent.

=item *

Hex encoding ("%nn") in URIs is converted to its 8-bit character
equivalent.

=back

=head1 BUGS

This utility is a hack.  It is not a rugged tool built for a
production environment.  It exists to assist you in decoding
the obfuscated web pages favored by spammers.

Quoted-printable is recognized by a low-rent heuristic.

The "%nn" hex character decoding is performed throughout the
document, not just in URIs.  This may lead to undesired results.

=head1 SEE ALSO

ascii(7)

=head1 AUTHOR

Chip Rosenthal
Unicom Systems Development

$Id: ungoopspam,v 1.4 2002/01/31 09:20:40 chip Exp $

See http://www.unicom.com/sw/ungoopspam/ for latest version.