#! /usr/bin/perl -w

# vim:syntax=perl

use strict;

use lib '/usr/share/perl5';

use Lire::Config;
use Lire::Program qw( :msg tempdir tempfile $PROG $LR_ID );

use Getopt::Long;
use Mail::Address;
use MIME::Parser;
use MIME::WordDecoder;
use File::Path qw/rmtree/;
use File::Copy qw/move/;

my $usage = "Usage: $PROG [-c type]";
my %opts;
GetOptions( \%opts, "content-type=s" )
  or lr_err( $usage );

my $tmpdir = eval { tempdir( "$PROG.$LR_ID.XXXXXX" ) };
lr_err( $@ ) if $@;

my $parser = new MIME::Parser;
$parser->output_dir( $tmpdir );
$parser->extract_uuencode(1);

my $file = @ARGV ? $ARGV[0] : "-";
my $msg = eval { $parser->parse_open( $file ) };
if ($@) {
    # Report error
    my $head = $parser->last_head;
    print "${PROG}_OK=\n";
    eval { print_header_vars( $head ) };
    lr_err( $@ );
}

eval {
    print_header_vars( $msg->head );
    print_attachment( $msg, $opts{'content-type'} );
};
lr_err( $@ ) if $@;
print qq{${PROG}_OK="1"\n};

# Unless we are searching for a specific content-type, 
# the extracted attachment will be the biggest one, so as to
# skip small messages or signatures.
sub print_attachment {
    my ( $msg, $type ) = @_;

    my ($fh, $tmpfile) = tempfile( "$PROG.$LR_ID.XXXXXX" );

    my $target = find_attachment( $msg, $type );
    unless ($target) {
	# Empty body
	print qq{${PROG}_FILE=""\n};
	unlink $tmpfile;
	close $fh;
	return;
    }

    my $body = $target->bodyhandle;

    if ( $body->path ) {
	close $fh;

	# File is on disk, rename the file
	move( $body->path, $tmpfile )
	  or lr_err( "error renaming ", $body->path, " to $tmpfile: $!" );
    } else {
	print $fh $body->as_string;
	close $fh;
    }

    print qq{${PROG}_FILE="$tmpfile"\n};
}

sub find_attachment {
    my ( $ent, $type ) = @_;

    my $size = 0;
    my $part;

    # Iterate through all possible bodies: either the top-level body
    # in the case of a non-multipart message. Or all sub parts.
    my @possible = $ent->parts ? $ent->parts : $ent;
    foreach my $candidate ( @possible ) {
	# Possibly recurse
	if ( $candidate->parts ) {
	    $candidate = find_attachment( $candidate, $type );
	    next unless $candidate;
	}

	if ( $type ) {
	    return $candidate 
	      if ( $candidate->mime_type eq $type );
	} else {
	    # This is the one if it's bigger than the one we already
	    # found. This heuristic may fail in the case that a really
	    # log file smaller than a possible signature is sent, but 
	    # it's already an improvement over always returning the first or 
	    # last attachment.

	    my $body = $candidate->bodyhandle;
	    next unless $body;

	    if ( $body->path ) {
		if ( -s $body->path > $size ) {
		    $size = -s $body->path;
		    $part = $candidate;
		}
	    } else {
		if ( length $body->as_string > $size ) {
		    $size = length $body->as_string;
		    $part = $candidate;
		}
	    }
	}
    }

    return $part;
}

sub extract_email {
    my ( $header ) = @_;
    
    return undef unless defined $header;
    my @addrs = Mail::Address->parse( unmime( $header) );
    return undef unless @addrs;

    # _SUBMITTER is space separated in case of multiple addresses
    return join ' ', map { $_->address } @addrs;
}



sub print_header_vars {
    my ($head) = @_;

    my $from	 = extract_email( $head->get( "from" ) );
    my $reply_to = extract_email( $head->get( "reply-to" ) );
    my $sender	 = extract_email( $head->get( "sender" ) );
    my $to	 = extract_email( $head->get( "to" ) );

    my $submitter = $reply_to ? $reply_to : ( $from ? $from : $sender );

    die "can't determine submitter\n" unless $submitter;
    print qq{${PROG}_SUBMITTER='$submitter'\n};
    print qq{${PROG}_TO='$to'\n};
    
    my $subject = $head->get( "subject" );
    if ( defined $subject ) {
	$subject = unmime( $subject );
	# Remove all non-safe characters: 
	# Safe characters are: 
	#   alphanumeric characters
	#   punctuation and other glyphs: @[](){}*=+-%^:.,;?#|/
	#   spaces: ' ' and \t
	#   Extended Latin 1: 160-255
	$subject =~ s{[^-\w\d\@\Q[](){}*=+-%^:.,;?#/|\E\xA8-\xFF \t]+}{}g; 
	print qq{${PROG}_SUBJECT='$subject'\n};
    } else {
	print qq{${PROG}_SUBJECT=''\n};
    } 

    my $date = $head->get( "date" );
    if ( defined $date ) {
	$date = unmime( $date );
	$date =~ tr/a-zA-Z0-9:+ -//cd;
	print qq{${PROG}_DATE='$date'\n};
    } else {
	print qq{${PROG}_SUBJECT=''\n};
    } 
}

END {
    if ( defined $tmpdir && -d $tmpdir ) {
	if ( Lire::Config->get( 'lr_keep' ) ) {
	    lr_info( "keeping temporary files in $tmpdir on your request" );
	} else {
	    rmtree( $tmpdir, 0, 1 );
	}
    }
}

# Local Variables:
# mode: cperl
# End:

__END__

=pod

=head1 NAME

B<lr_getbody> - Extract file, sender and subject information from an email

=head1 SYNOPSIS 

eval `B<lr_getbody> [B<-c I<content-type>>] <messagefile>`

=head1 DESCRIPTION

B<lr_getbody> reads an email message on stdin (or specified as
argument), will extract the submitter address and the subject from
that email. It will also extract the biggest attachment or the first
one having a content-type matching the I<content-type> argument.

This information is passed to the caller by printing that information
in a format suitable for eval. The following variable will be printed
on STDOUT:

=over 4

=item lr_getbody_SUBMITTER

Email address of the submitter. That is the first email that is found
in the headers (searched in the following order): Reply-To, From,
Sender. This will only contains the address portion (without the <>).

=item lr_getbody_TO

The email address to which the email was sent.

=item lr_getbody_SUBJECT

Subject of the message. That string is sanitized.

=item lr_getbody_DATE

The date of the message. That string is sanitized.

=item lr_getbody_FILE

Path to the temporary file holding the found attachment.

=item lr_getbody_OK

This will be set if the operation was completed successfully. It is
possible for B<lr_getbody> to fail to extract the attachment, but the
header information might still be available.

=back

This script is used by lr_processmail(1) and lr_rawmail2mail(1).

=head1 VERSION

$Id: lr_getbody.in,v 1.10 2006/07/23 13:16:33 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2002 Stichting LogReport Foundation LogReport@LogReport.org

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html.

=head1 AUTHOR

Francis J. Lacoste <flacoste@logreport.org>

=cut

# Local Variables:
# mode: cperl
# End:
