#!/usr/bin/perl

=head1 NAME

seek-phrases-in-corpus - given a corpus of spam, seek out common phrases

=head1 SYNOPSIS

seek-phrases-in-corpus [--grep 'pattern'] ham:dir:/path spam:dir:/path2 ...

=head1 DESCRIPTION

Given a _small_ corpus of ham and spam mails (specified in mass-check format),
this will attempt to find patterns that appear in at least 2 spams, then list
out all the patterns that have a 1.0 S/O ratio (ie. hit spam and no ham).

The output format looks like:

 1.000   8.633   0.000  /pattern/, /pattern2/, /pattern3/
 1.000   8.633   0.000  /pattern4/
 1.000  10.000   0.000  /pattern5/

First field is S/O (and will always be 1.000).  Second, the SPAM%
figure -- how much of the spam corpus, as a percentage, contains the
pattern.  Third is the list of one or more pattern(s) that hit this
subset of messages.

Note that patterns that hit a different subset of the messages in the spam
corpus, are listed on separate lines; e.g., in the example above, /pattern3/
and /pattern4/ both hit 8.633% of the spam corpus -- however, they hit a
different 8.633%, not the same subset of messages.  On the other hand,
/pattern2/ and /pattern3/ are hitting exactly the same messages.

The patterns are simple substrings, not regular expressions; don't
be misled by the use of "/" as a delimiter.  The body text is rendered
as SpamAssassin "body" rendering.

=cut

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# ---------------------------------------------------------------------------

use Getopt::Long;
use Carp qw(croak);
use FindBin;

use strict;
use warnings;

my $sadir = "$FindBin::Bin/../..";
my $tmpdir = "/tmp/findpats.tmp.$$";

my %opt = ();
GetOptions(
  'grep=s'       => \$opt{grep},
) or die "see perldoc for usage";

my $mcargs = ' '.join(' ', @ARGV).' ';

# extract just the ham or spam targets
my $mcargs_h = $mcargs; $mcargs_h =~ s/ spam:\S+ //gs;
my $mcargs_s = $mcargs; $mcargs_s =~ s/ ham:\S+ //gs;

if ($mcargs_h !~ /\bham:/) {
  die "seek-phrases-in-corpus: no 'ham:type:path' corpus specifier found!\n";
}
if ($mcargs_s !~ /\bspam:/) {
  die "seek-phrases-in-corpus: no 'spam:type:path' corpus specifier found!\n";
}

my $re = $opt{grep};

# ---------------------------------------------------------------------------

(-d "$tmpdir/cor") and run ("rm -rf $tmpdir/cor");
(-d "$tmpdir/cor") or run ("mkdir -p $tmpdir/cor");

# note: -c=/dev/null so no rules ever run
# don't grep the ham set!
run("cd $sadir/masses && ".
    "./mass-check --cf='loadplugin Dumptext plugins/Dumptext.pm' ".
    " --cf='loadplugin Mail::SpamAssassin::Plugin::Check' ".
    " --cf='loadplugin GrepRenderedBody plugins/GrepRenderedBody.pm' ".
    " -n -o --showdots -c=/dev/null ".
    " $mcargs_h > $tmpdir/w.h");

# *do* grep the spam, though
run("cd $sadir/masses && ".
    "./mass-check --cf='loadplugin Dumptext plugins/Dumptext.pm' ".
    " --cf='loadplugin Mail::SpamAssassin::Plugin::Check' ".
    " --cf='loadplugin GrepRenderedBody plugins/GrepRenderedBody.pm' ".
    ($re ? " --cf='grep $re' " : "").
    " -n -o --showdots -c=/dev/null ".
    " $mcargs_s > $tmpdir/w.s");

run("perl -w $sadir/masses/rule-dev/seek-phrases-in-log ".
        "--ham $tmpdir/w.h --spam $tmpdir/w.s > $tmpdir/result");

run("cat $tmpdir/result");
exit;

# ---------------------------------------------------------------------------

sub run {
  my $cmd = shift;
  warn "[$cmd]\n";
  system $cmd;
  ($? >> 8 != 0) and Carp::croak("command failed");
}

