#!/usr/local/bin/perl5 -wT

# Copyright (c) 2000                            RIPE NCC
#
# All Rights Reserved
#
# Permission to use, copy, modify, and distribute this software and its
# documentation for any purpose and without fee is hereby granted,
# provided that the above copyright notice appear in all copies and that
# both that copyright notice and this permission notice appear in
# supporting documentation, and that the name of the author not be
# used in advertising or publicity pertaining to distribution of the
# software without specific, written prior permission.
#
# THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
# ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS; IN NO EVENT SHALL
# AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# Lee Wilmot 20020624

# Filter: replace/mask all email address from a file
# Input : STDIN
# Output: STDOUT
#
# The purpose is to avoid trawling of email 
# addresses from the document. Special attention
# is paid to mhonarc-generated files, but the script
# should work fine for all.
#
# Emails in <a href> tags are replaced with forms
# pointing to a CGI script which can descramble
# the scrambled addresses.
#
# The domain names of all other emails are replaced
# in a not-recoverable way.
#
# --- Notes ---
#
# - apologies that the three nasty regex's aren't /x, it
#   interfered with the match
#
# --- Known Bugs ---
#
# - nested hrefs will be corrupted
# - multiple mh addresses in a mail header field will not
#   appear on the same line in most browsers

# ------------------- CONFIGURATION ----------------------- #

use HTML::Entities;

# mailto: emails can be recovered by feeding the
# form values to this script. The path is
# relative to the document root on the webserver

my $DESCRAMBLE_CGI_PATH = '/cgi-bin/descramble.pl';

# 'Special' characters in an email address according
# to rfc822 

my $EMAIL_SPECIALS = &quote_regex_meta( '\()<>@,;:.[]"' );

# Any email-like things found after the mailto's are replaced,
# the domain is set to this

my $REPLACEMENT_DOMAIN = 'localhost';

# ----------------------- START --------------------------- #

# Slurp in whole doc at once

undef $/;

my $doc = <>;

# Each page can have multiple forms, we need to give
# each one a unique name. I chose to base it on a
# counter. A global variable is needed because all
# replacements are done at once.

my $FORM_COUNT = 1;

# --------------------- STEP ONE ------------------------- #

# Look for mail headers as produced by mhonarc
# We need to treat these specially, as the real name of
# the person is -before- the mailto: field (duh)

# Can't do /x, the whitespace gets in the way of the match
# Format: mh-field, name, email

if ( $doc =~ /<!-- MHonArc / ) {
    $doc =~ s/&quot;/"/g;
    $doc =~ s/<li>(<em>(?:From|To|Cc|Reply\-to|Resent\-from|Resent\-sender)<\/em>:\s*.*\s*<A HREF="mailto:(?:[^"]+)">.*<\/A>\s*.*)<\/li>\s*\n/&scramble_mh_line($1);/eig;
}

# --------------------- STEP TWO ------------------------- #

# Match and replace all remaining mailto:'s

$doc =~ s/(<a\s+[^>]*?href="?\s*mailto\s*:.*?<\/a>)/&scramble_normal($1)/eig;

# -------------------- STEP THREE ----------------------- #

# Finally, mask all remaining email-addressey-things

# Look for roughly...
#
#    NDCS@(NDCS+.)*(NDCS+)
#
# (NDCS = Sequence of not-domain-characters)
# Replace with $REPLACEMENT_DOMAIN after the @
#

$doc =~ s/([^\s$EMAIL_SPECIALS]+)\\?@(?:[^\s$EMAIL_SPECIALS]+\.)*(?:[^\s$EMAIL_SPECIALS]+)/$1\@$REPLACEMENT_DOMAIN/g;

# Send out the filtered result

print $doc;

exit 0;

# ----------------------- SUBROUTINES --------------------------- #

# Purpose           : make a form with a scrambled version of an email address
# In: $name         : the 'name' part of an email address (e.g. Lee Wilmot)
# In: $email        : the email address
# In: $start_of_form: text that should go before the name in the form
# In: $end_of_form  : text that should go after the name in the form
# Out: $scrambled   : the result
#
# Method            : we make a 'post' form and split the localname and domain
#                     labels into separate values on the form
#
sub scramble
{
    my ( $name, $email, $start_of_form, $end_of_form ) = @_;

    # If there is no name, make sure it's '' rather than undef

    $name = $name || '';

    my ( $scrambled, $rest_of_mailto );

    # Sometimes there's more than just an email address in the
    # mailto: the rest is separated by a space

    if ( $email =~ /(.*?)\?(.*)/ ) {
	( $email, $rest_of_mailto ) = ( $1, $2 );
    }

    if (
	$email =~ /^
	   (.+)
	   \@
	   ([^\@]+)
	$
	/x
    ) {
	my ( $localname, $domain ) = ( $1, $2 );

	$scrambled = '';

	# Start form

	$scrambled .= "<form method=\"POST\" action=\"$DESCRAMBLE_CGI_PATH\" name=\"f$FORM_COUNT\">";

	# Text at beginning of form if any was specified

	$scrambled .= $start_of_form if ( $start_of_form );

	# Hidden field with the local name

	$scrambled .= "<input type=\"hidden\" name=\"localname\" value=\"$localname\" />";

	# A hidden field for each label in $domainname
	# Note: they're added in reverse
	# The name of the input is just a number

        my $domain_counter = 1;

	my @domains = split /\./, $domain;

	while ( my $domain = pop @domains ) {
	    $scrambled .= "<input type=\"hidden\" name=\"$domain_counter\" value=\"$domain\" />";
	    $domain_counter++;
        }

	# Any other attributes e.g. subject

	$scrambled .= "<input type=\"hidden\" name=\"other\" value=\"$rest_of_mailto\" />"
	    if ( defined $rest_of_mailto );

	# We can't do HTML markup when a button name will appear, remove
	# all HTML tags and entities

	my $non_javascript_name = $name;
	$non_javascript_name =~ s/<.*?>//g;
	$non_javascript_name = decode_entities( $non_javascript_name );

	# Submit
	# With javascript, it's an href, without it's a button
	
	$scrambled .= <<"        EOF";
	    <script language=\"javascript\">
	     <!--
		document.write(\"<a href=\\\"javascript:document.f$FORM_COUNT.submit()\\\">$name</a>\")            
	    // -->
	    </script>
	    <noscript>
		<input type=\"submit\" value=\"$non_javascript_name\" />
	    </noscript>
        EOF

	# Text at end of form if any was specified

	$scrambled .= $end_of_form if ( $end_of_form );

	$scrambled .= '</form>';

	$FORM_COUNT++;
    }

    else {
	return $name;
    }

    return $scrambled;
}

# Purpose: scramble a mailto HREF lying around anywhere in a document
# In:      $href: the whole <A> tag up to and including </A>
# Out:     $href: the scrambled version
#

sub scramble_mh_line
{
    my $line = shift;

    $line =~ /^<em>(From|To|Cc|Reply\-To|Resent\-from|Resent\-sender)<\/em>:\s*(.*?)$/i;
	      
    my ( $mh_field, $rest_of_mh_line ) = ( $1, $2 );

    # Highlight the first char of the From etc, it looks
    # nicer

    $mh_field = ucfirst( $mh_field );

    my @scrambled_emails;

    # We can't just split on ',', because some people include it in their
    # fullname part. Therefore, we rely on mhonarc's ability to split the
    # header correctly, and match against it's possiblities for the bit
    # before a ',' which actually separates the recipients
    # i.e. preceeded by </A> | " | &gt;

    # Yes, I agree this can conceivably fail 

    my $weird_string = 'PLEASESPLITONEMEILOVEIT';

    $rest_of_mh_line =~ s/(<\/A>|&gt;|")\s*,/$1$weird_string/ig;

    foreach my $mh_address (  split /$weird_string/, $rest_of_mh_line ) {

	# Pickout the email address, and the bits before and after it

	if ( $mh_address =~ /\s*(.*)\s*<A HREF="mailto:([^"]+)">[^<]+<\/A>\s*(.*)\s*$/i ) {

	    my ( $prelim, $the_email, $postlim ) = ( ( $1 || ''), $2, ( $3 || '' ) );

	    # Remove email addresses from the markup before and after the actual
	    # email address

	    $prelim =~ s/\s*<A HREF="mailto:(?:[^"]+)">(.*)<\/A>/$1/ig;
  	    $postlim =~ s/\s*<A HREF="mailto:(?:[^"]+)">(.*)<\/A>/$1/ig;
					     
	    # Replace the From etc if this is the first email address we're
	    # converting

	    $prelim = "<em>$mh_field</em>: " .$prelim
		    if ( @scrambled_emails == 0 );

	    # Now scramble it

            $mh_address = &scramble( $the_email, $the_email, $prelim, $postlim );
	}

	push @scrambled_emails, $mh_address;
    }

    my $scrambled_emails = join '', @scrambled_emails;

    return "<li>$scrambled_emails</li>", 
}

# Purpose: scramble a mailto HREF lying around anywhere in a document
# In:      $href: the whole <A> tag up to and including </A>
# Out:     $href: the scrambled version
#
sub scramble_normal
{
    my $href = shift;

    if ( $href =~ /
	<a\s+[^>]*                  # start of tag
	href=
	(?:"\s*mailto\s*:\s*([^">]+)"|\s*mailto\s*:\s*([^\s"]+)[\s"])  # mailto either with or without quotes
        [^>]*>                      # to end of <a> tag
        (.*)                        # the name of receiver (i.e. between <a> and <\/a>)
        <\s*\/\s*a\s*>$             #  <\/a>
     /xi
 ) {
      my ( $email, $name ) = ( $1 || $2, $3 );

      return &scramble( $name, $email );
    }
    else {
       return $href;
    }
}

# Purpose: backslash all regex special characters in
#          the passed string

sub quote_regex_meta
{
    my $to_quote = shift;

    $to_quote =~ s/([\\|\(\)\[\]\{\}\^\$\*\+\?\.])/\\$1/g;

    return $to_quote;
}
