#! /usr/bin/perl -w

# vim:syntax=perl

use strict;
use lib '/usr/share/perl5';
use Lire::DlfSchema;
use Lire::Email;
use Lire::Program qw( :msg :dlf );

# 1 is unassigned
# my $debug_print_dlf = 2;
my $debug_sanitize = 4;

# $debug is bitmask
#
# my $debug = $debug_sanitize;
my $debug = 0;

my $schema = eval { Lire::DlfSchema::load_schema( "email" ) };
lr_err( "failed to load email schema: $@" ) if $@;
my $dlf_maker = 
  $schema->make_hashref2asciidlf_func( qw/time logrelay queueid msgid 
		      from_user from_domain from_relay_host from_relay_ip 
		      size delay xdelay
		      to_user to_domain to_relay_host to_relay_ip 
		      stat xstat
				       /);

my $lines	= 0;
my $dlflines	= 0;
my $errorlines  = 0;

# keep track of being processed dlfid's (a dlfid is a hostname . queueid )
my %msg         = ();

#
# merge hashes old and current in old
#

sub parse_sendmail_log {
    my ( $dlfid, $log ) = @_;

    my $time = $log->{timestamp};

    if (defined $log->{'from'}) {
	# Skip SYSERR line
	# XXX: Does this send an email. We get nrcpts=1 after those.
	# Maybe we should set to_user in this case.
	return if $log->{content} =~ / SYSERR\(root\):/;

	my $cur = $msg{$dlfid} ||= {
				    time      => $time,
				    logrelay  => $log->{hostname},
				    queueid   => $log->{queueid},
				   };

	my $from;
	sanitize('emailadress', $log->{from}, $from );
	($cur->{from_user}, $cur->{from_domain}) = splitemailadress( $from );

	die "fromline without a relay field\n"
	  unless defined $log->{relay};
	sanitize('relay', $log->{relay}, $log->{relay});

	my ( $fromrelayhost, $fromrelayip) = splitrelay( $log->{relay} );
	sanitize('relayhost', $fromrelayhost, $cur->{from_relay_host});
	sanitize('relayip', $fromrelayip,     $cur->{from_relay_ip});

	die "fromline without a size field\n"
	  unless defined $log->{size};
	$cur->{size} = $log->{size};

	# It is possible that nrcpts will be set to 0. This happens for
	# from line following check_*. But it can also happens on other
	# "normal" occasion for reason I don't know. Those will only be output
	# at the end.
	die "fromline without a nrcpts field\n"
	  unless defined $log->{nrcpts};
	$cur->{nrcpts} = $log->{nrcpts} || -1; # -1 = Guard value

	if ( $log->{msgid} ) {
	    sanitize('msgid', $log->{msgid}, $cur->{msgid});
	} else {
	    lr_debug( "interesting line '$_': fromline without a msgid")
	      if $debug;
	}
    } elsif (defined $log->{'to'}) {
        lr_debug "gonna sanitize_tos " . $log->{'to'}  
	  if $debug & $debug_sanitize;

	my $cur  = $msg{$dlfid};
	unless ( $cur ) {
	    $cur = $msg{$dlfid} = {
				   time	     => $time,
				   logrelay  => $log->{hostname},
				   queueid   => $log->{queueid},
				   nrcpts    => -1, # Guard value
				  };
	}
        # cur{'tos'} is gonna point to an array of adresses
	my $tos = sanitize_tos($log->{to});

        lr_debug "sanitized tos to '@$tos'" 
	  if $debug & $debug_sanitize;

        # the following code snippet would yield a from address, even when
        # none is found in a regular from= line, based upon the ctladdr in
        # a to= line.  We have seen lots of logs where ctladdr is the actual
        # sending user.  E.g.
        #
        # PAA01703: from=<ben@example.com>
        # PAA01703: to=<foo@example.nl>, ctladdr=<ben@example.com>
        #
        # However:
        # ' to="|exec /usr/local/bin/procmail", ctladdr=joe@droopey '
        # means joe@droopey is the _receiving_ user.  Therefore, we will no
        # longer use this trick: apparently, contents of ctladdr doesn't give
        # any reliable information.... :(

        # if ( exists $log->{ctladdr} && ! $cur->{from_user} ) {
        #     # Determine from_user based on the controlling address
        #    my $from;
        #    sanitize('emailadress', $log->{ctladdr}, $from );
        #
        #    ($cur->{from_user}, $cur->{from_domain}) = splitemailadress($from);
        # }


	foreach my $to ( @$tos ) {
	    my $del = $cur->{deliveries}{$to};
	    
	    unless ( $del ) {
		$del = $cur->{deliveries}{$to} = {};
		($del->{to_user}, $del->{to_domain}) = splitemailadress($to);
	    }
	    
	    if (defined $log->{'relay'}) {
		my $relay;
		sanitize('relay', $log->{'relay'}, $relay );

		my ( $torelayhost, $torelayip) = splitrelay( $relay );
		sanitize('relayhost', $torelayhost, $del->{to_relay_host});
		sanitize('relayip', $torelayip, $del->{to_relay_ip});
	    } elsif (defined $log->{'mailer'} &&
		     ( $log->{'mailer'} eq 'local' || 
		       $log->{'mailer'} eq 'prog') ) 
	    {
		my ( $torelayhost, $torelayip) = splitrelay("localhost");
		sanitize('relayhost', $torelayhost, $del->{to_relay_host});
		sanitize('relayip', $torelayip, $del->{to_relay_ip});
	    }
	    
	    for my $flag ('delay', 'xdelay') {
		if (defined $log->{$flag}) {
		    sanitize('delay', $log->{$flag}, $del->{$flag});
		    lr_debug( "sanitized $flag '", $log->{$flag},
			      "' to '", $del->{$flag}, "'" )
		      if $debug & $debug_sanitize;
		}
	    }

	    die "to-line without a stat field\n"
	      unless defined $log->{stat};

	    my $stat;
	    sanitize( 'stat', $log->{'stat'}, $stat);
	    ($del->{stat}, $del->{xstat}) = splitstat( $stat );

	    if ( $del->{stat} ne "queued" && $del->{stat} ne 'deferred' ) {
		$dlflines += print_dlf( $cur, $dlf_maker, \%msg, $to );
	    }
	}
    } elsif ( defined $log->{ruleset} ) {
	my $cur = $msg{$dlfid} ||= {
				    time      => $time,
				    logrelay  => $log->{hostname},
				    queueid   => $log->{queueid},
				   };
	
	sanitize( 'stat', $log->{ruleset}, $cur->{stat});
	sanitize( 'stat', $log->{reject},  $cur->{xstat});
    } elsif ( $log->{content} =~ /^([0-9a-zA-Z]{12}): ([0-9a-zA-Z]{12}):/ ) {
	# Handle DSN, sender notify, postmaster notify, etc.
	# Oct 31 14:10:32 talyn sendmail[29217]: f9VNAM129215: f9VNAW129217:
	#     postmaster notify: User unknown
	# Oct 31 14:10:33 talyn sendmail[29217]: f9VNAW129217: to=root, 
	#     delay=00:00:01, xdelay=00:00:00, mailer=local, pri=33136, 
	#     dsn=2.0.0, stat=Sent
	# Nov 1 11:30:15 talyn sendmail[1982]: fA1KUDK01980: fA1KUFK01982: 
	#     DSN: Return receipt
	# Nov 1 11:30:15 talyn sendmail[1982]: fA1KUFK01982: 
	#     to=<john.doe@example.mail.com>, delay=00:00:00, 
	#     xdelay=00:00:00, mailer=esmtp, pri=30100, relay=[10.20.4.42]
	#     [10.20.4.42], dsn=2.0.0, stat=Sent (OK)
	$dlfid = $log->{hostname} . $2;
	my $new = $msg{$dlfid} = { logrelay => $log->{hostname},
				   queueid  => $2,
				   time	    => $time,
				   nrcpts   => -1, # Guard value
				 };
	sanitize('emailadress', "<>", $new->{from} );

	($new->{from_user}, $new->{from_domain}) =
	  splitemailadress( $new->{from} );

	sanitize('relay', "localhost", $new->{relay});

	my ( $fromrelayhost, $fromrelayip) = splitrelay( $new->{relay} );
	sanitize('relayhost', $fromrelayhost, $new->{from_relay_host});
	sanitize('relayip', $fromrelayip,     $new->{from_relay_ip});
    } else {
        # we don't wanna hear about this. happens quite often in typical
        # logfiles.
        # lr_warn "'$_': to- nor from-line: skipping\n";
    }
}

# We don't sort on Queuid because there is no warranty that 
# the queueid of related messages will sort in order. Sendmail
# will use lines like OLD_QUEUEID : NEW_QUEUEID for bounce and DSN message
# and we NEW_QUEUEID could sort before OLD_QUEUEID and some information about
# NEW_QUEUEID would be missing.
my $parser = new Lire::Email();
init_dlf_converter( "email" );
while ( <> ) {
    lire_chomp;
    $lines++;
    lr_debug "next line '$_'" if $debug;

    my $log;
    eval {
	$log = $parser->parse($_);
    };
    if ( $@ ) {
	lr_warn( $@ );
	$errorlines++;
	next;
    }

    my $dlfid;
    if (defined $log->{'queueid'}) {
        if (defined $log->{'hostname'}) {
            $dlfid = $log->{'hostname'} . $log->{'queueid'};
        } else {
	    $log->{'hostname'} = '';
            $dlfid = $log->{'queueid'};
        }
    } else {
        # we don't wanna hear about this.
        # lr_warn "skipping line '$_': no queueid\n";
        next;
    }

    eval {
	parse_sendmail_log( $dlfid, $log );
    };
    if ( $@ ) {
	lr_warn( $@ );
	lr_warn( "error parsing line '$_' as a sendmail log. Skipping" );
	$errorlines++;
	next;
    }
}

# flush other messages
foreach my $dlfid ( keys %msg ) {
    $dlflines += print_dlf( $msg{$dlfid}, $dlf_maker, \%msg );
}

end_dlf_converter( $lines, $dlflines, $errorlines );

__END__

=pod

=head1 NAME

sendmail2dlf - convert sendmail logfiles to dlf

=head1 SYNOPSIS

B<sendmail2dlf>

=head1 DESCRIPTION

sendmail2dlf(1) converts a LogLevel 9 sendmail (8.10.x or higher) logfile, as
created using syslog, to a Lire email Distilled Log Format file.

Input is one line per event.  Outputted is one line per delivery:

 time logrelay queueid msgid fromuser fromdomain fromrelay \
 size delay xdelay touser todomain torelay stat

(This should be the format as defined in email/dlf.cfg.)

=head1 EXAMPLE

The lines 

 Apr 20 03:00:11 firewall sendmail[442]: DAA00442: \
   from=<user@example.com>, size=4992, class=0, \
   pri=34992, nrcpts=1, \
   msgid=<200004192316.BAA19611@achilles.noot.com>, \
   proto=ESMTP, relay=host.example.nl [150.0.0.45]
 Apr 20 03:00:11 firewall sendmail[442]: DAA00442: \
   to=<jan@aap.com>, delay=00:00:00, mailer=smtp, \
   stat=queued
 Apr 20 05:00:11 firewall sendmail[503]: DAA00442: \
   to=<jan@aap.com>, delay=02:00:00, \
   xdelay=00:00:03, mailer=smtp, relay=mailgw.aap.com. \
   [3.4.64.199], stat=Sent (OK id=12i7CN-0001Kv-00)

wil be converted to

 956109611 firewall DAA00442 \
   <200004192316.baa19611@achilles.noot.com> user \
   example.com host.example.nl_[150.0.0.45] 4992 0 0 \
   jan aap.com host.example.nl._[150.0.0.45] queued \
   UNKNOWN
 956116811 firewall DAA00442 \
   <200004192316.baa19611@achilles.noot.com> user \
   example.com host.example.nl_[150.0.0.45] 4992 \
   7200 3 jan aap.com mailgw.aap.com._[3.4.64.199] \
   sent (ok_id=12i7cn-0001kv-00)

The lines

 Mar 17 13:34:32 mailhost sendmail[8408]: NAA08408: \
  from=<piet@example.com>, size=1890, class=0, \
  pri=0, nrcpts=4, \
  msgid=<000b01bf9009$f6885b20$6c062014@sabepc06.be.example.com>, \
  proto=ESMTP, relay=root@[1.2.6.10]
 Mar 17 13:45:26 mailhost sendmail[8457]: NAA08408: \
  to=lkrksen@www, delay=00:10:56, xdelay=00:00:01, \
  mailer=smtp, relay=www.example.nl. [194.229.43.3], \
  stat=Sent (NAA06261 Message accepted for delivery) \
 Mar 17 13:45:27 mailhost sendmail[8457]: NAA08408: \
  to=ll@host.example.com, delay=00:10:57, \
  xdelay=00:00:01, mailer=smtp, relay=host.example.nl. \
  [150.0.0.45], stat=Sent (OK)
 Mar 17 13:45:31 mailhost sendmail[8457]: NAA08408: \
  to=<mvelsla@aap.com>,<pvhove@aap.com>,<pdebaerd@aap.com>, \
  delay=00:11:01, xdelay=00:00:04, mailer=smtp, \
  relay=mailgw.aap.com. [3.4.64.199], stat=Sent (OK \
  id=12Vw8J-0001iT-00)

will be converted to

 953210726 mailhost NAA08408 \
  <000b01bf9009$f6885b20$6c062014@sabepc06.be.example.com>\
  piet example.com root@[1.2.6.10] 1890 656 1 lkrksen \
  www www.example.nl._[194.229.43.3] sent \
  (naa06261_message_accepted_for_delivery)
 953210727 mailhost NAA08408 \
  <000b01bf9009$f6885b20$6c062014@sabepc06.be.example.com> \
  piet example.com root@[1.2.6.10] 1890 657 1 ll \
  host.example.com host.example.nl._[150.0.0.45] sent (ok)
 953210731 mailhost NAA08408 \
  <000b01bf9009$f6885b20$6c062014@sabepc06.be.example.com> \
  piet example.com root@[1.2.6.10] 1890 661 4 mvelsla \
  aap.com mailgw.aap.com._[3.4.64.199] sent \
  (ok_id=12vw8j-0001it-00)
 953210731 mailhost NAA08408 \
  <000b01bf9009$f6885b20$6c062014@sabepc06.be.example.com> \
  piet example.com root@[1.2.6.10] 1890 661 4 pvhove \
  aap.com mailgw.aap.com._[3.4.64.199] sent \
  (ok_id=12vw8j-0001it-00)
 953210731 mailhost NAA08408 \
  <000b01bf9009$f6885b20$6c062014@sabepc06.be.example.com> \
  piet example.com root@[1.2.6.10] 1890 661 4 pdebaerd \
  aap.com mailgw.aap.com._[3.4.64.199] sent \
  (ok_id=12vw8j-0001it-00)

The lines

 Mar 15 13:34:09 firewall sendmail[279]: NAA00279: \
  from=<klaas@example.com>, size=2281952, class=0, \
  pri=2311952, nrcpts=1, \
  msgid=<200003151230.NAA00112@mailhost.example.nl>, \
  proto=ESMTP, relay=host.example.nl [150.0.0.45]
 Mar 15 13:34:09 firewall sendmail[279]: NAA00279: \
  to=<klaas@hotmail.com>, delay=00:00:04, mailer=smtp, \
  stat=queued
 Mar 15 13:39:58 firewall sendmail[401]: NAA00279: \
  to=<klaas@hotmail.com>, delay=00:05:53, xdelay=00:00:06, \
  mailer=smtp, relay=mc5.law5.hotmail.com. \
  [216.32.243.136], stat=Service unavailable
 Mar 15 13:39:58 firewall sendmail[401]: NAA00279: \
  NAA00401: postmaster notify: Service unavailable
 Mar 15 13:40:04 firewall sendmail[401]: NAA00401: \
  to=klaas@host.example.com, delay=00:00:06, \
  xdelay=00:00:04, mailer=smtp, relay=host.example.nl. \
  [150.0.0.45], stat=Sent (OK)

will be converted to

 953037249 firewall NAA00279 \
  <200003151230.naa00112@mailhost.example.nl> klaas \
  example.com host.example.nl_[150.0.0.45] 2281952 4 1 \
  klaas hotmail.com mailgw.csc.com._[208.219.64.199] \
  queued UNKNOWN
 953037598 firewall NAA00279 \
  <200003151230.naa00112@mailhost.example.nl> klaas \
  example.com host.example.nl_[150.0.0.45] 2281952 353 6 \
  klaas hotmail.com mc5.law5.hotmail.com._[216.32.243.136] \
  service unavailable

The fact that the delivery 'Mar 15 13:40:04 firewall sendmail[401]: 
NAA00401:' does not generate a dlf record is a bug.

When the line

 Mar 15 19:39:40 mailhost sendmail[2178]: TAA02178: \
  from=<foo@hotmail.com>, size=0, class=0, pri=0, \
  nrcpts=0, proto=SMTP, relay=[1.84.7.150]

occurs in the input, and there is no line carrying the same queueid, the 
line is discarded, and reported as skipped: any to- or from- line, lacking 
any partner, will get discarded.

Lines like:

 Mar 15 13:40:19 firewall sendmail[456]: alias database \
  /etc/aliases.db out of date

wil get discarded

=head1 EXAMPLES

To process a log as produced by sendmail:

 $ sendmail2dlf < mail.log

sendmail2dlf will be rarely used on its own, but is more likely
called by lr_log2report:

 $ lr_log2report sendmail < /var/log/maillog

=head1 BUGS

When queueids are being reused within one logfile, behaviour is unpredictable.
Incomplete logsnippets (e.g. from-lines without to-lines) are not treated well.

It is reported events like this occur in sendmail log files:

 SAA14845: from=<>, size=146990, class=0, pri=176990, nrcpts=1, 
  msgid=<092246.09986@rly-yc03.mx.aol.com>, proto=ESMTP, 
  relay=omr-d06.mx.aol.com [205.188.156.71]
 SAA14845: to=joe@mailhost, ctladdr=<joe@example.edu>, delay=00:00:01, 
  mailer=local, stat=User unknown
 SAA14845: to=<joe@example.edu>, delay=00:00:01, mailer=local, 
  stat=User unknown
 SAA14845: SAA14846: postmaster notify: User unknown
 SAA14846: to="|exec /usr/local/bin/procmail", ctladdr=ann@mailhost (2217/10), 
  delay=00:00:00, xdelay=00:00:00, mailer=prog, stat=Sent
 SAA14846: to=bob@imap-ns, delay=00:00:01, xdelay=00:00:01, mailer=esmtp, 
  relay=apex.example.edu. [152.19.4.80], 
  stat=Sent (Message received: GVV8N400.CMX)
 SAA14846: to=eve@mailhost, delay=00:00:01, xdelay=00:00:00, mailer=local, 
  stat=Sent

Note that SAA14845 has _two_ final to= lines, while the from= line states
nrcpts=1.  This blows the axiom of this script away.  We haven't decided yet on
how to deal with this...

=head1 THANKS

Edward Eldred, for finding and reporting a bug.

=head1 VERSION

$Id: sendmail2dlf.in,v 1.32 2006/07/23 13:16:34 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2000, 2001, 2002 Stichting LogReport Foundation LogReport@LogReport.org

This program is part of Lire.

Lire is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html.

=head1 AUTHOR

Joost van Baal <joostvb@logreport.org>

=cut

# Local Variables:
# mode: cperl
# End:
