summaryrefslogtreecommitdiff
path: root/tests/mail_to_db.pl
blob: 7886ffb7e0fc844bbbe1f6d9dcdc466e4d181206 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#!/usr/bin/perl
# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB
# This file is public domain and comes with NO WARRANTY of any kind
#
# This program is brought to you by Janne-Petteri Koilo with the 
# administration of Michael Widenius.

# This program takes your mails and puts them into your database. It ignores
# messages with the same from, date and message text.
# You can use mail-files that are compressed or gzipped and ends with
# -.gz or -.Z.

use DBI;
use Getopt::Long;

$VER = "1.6";

$opt_db = "mail";
$opt_table = "mails";
$opt_max_mail_size = 65536;
$opt_db_engine = "mysql";
$opt_host = "localhost";
$opt_user = $opt_password = "";
$opt_help = $opt_version = $opt_test=0;

GetOptions("help","version","user=s","password=s",
	   "db_engine=s","db=s","host=s","max_mail_size=s","test") || usage();

usage($VER) if ($opt_help || $opt_version || !$ARGV[0]);

%months= ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5,
	  'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10,
	  'Nov' => 11, 'Des' => 12);

$count_no_from = $count_no_txt = $count_too_big = 0;
$count_forwarded_msgs = $count_duplicates = $no_subject = 0;
$inserted_mails = 0;
$dbh=0;

$dbh = DBI->connect("DBI:$opt_db_engine:$opt_db:$opt_host",$opt_user,
		    $opt_password,{ PrintError => 0}) || die $DBI::errstr;
if (!$opt_test)
{
  create_table_if_needed($dbh);
}

foreach (@ARGV)
{
  if (/^(.*)\.(gz|Z)$/) #checks if the file is compressed or gzipped
  {
    open(FILE, "zcat $_ |");
    process_mail_file($dbh,$1);
  }
  else
  {
    open(FILE,$_);
    process_mail_file($dbh,$_);
  }
}
$dbh->disconnect if (!$opt_test);

$ignored = $count_no_from + $count_no_txt + $count_too_big + $count_duplicates + $no_subject;
print "Mails inserted:\t\t\t$inserted_mails\n";
print "Mails ignored:\t\t\t$ignored\n";
print "Mails without \"From:\" -field:\t$count_no_from\n";
print "Mails without message:\t\t$count_no_txt\n";
print "Too big mails (> $opt_max_mail_size):\t$count_too_big\n";
print "Duplicate mails:\t\t$count_duplicates\n";
print "Forwarded mails:\t\t$count_forwarded_msgs\n";
print "No subject:\t\t\t$no_subject\n";
print "Mails altogether:\t\t"; 
print $inserted_mails+$ignored;
print "\n";
exit(0);

sub usage
{  
  my($VER)=@_;
  
  $0 =~ s/.\/(.+)/$1/;
  if ($opt_version)
  {
    print "$0 version $VER\n";
  } 
  else
  {
    print <<EOF;
$0 version $VER

Usage: $0 [options] file1 [file2 file3 ...]

Description: Inserts mails from file(s) into a database

Options:
--help             show this help and exit
--version          shows the version of the program
--db_engine=...    database server (default: $opt_db_engine)
--db=...           database to be used (default: $opt_db)
--host=...         hostname to be used (default: $opt_host)
--password=...     user password for the db server
--user=...         username for the db server
--max_mail_size=#  max size of a mail to be inserted into the db.
                   mail will be ignored if it exceeds this size
                   (default $opt_max_mail_size)
--test		   Don\'t connect to the database, just write the
		   queries to stdout
EOF
  }
  exit(0);
}

sub create_table_if_needed
{
  my ($dbh)=@_;
  my ($sth,$create);
  
  $sth = $dbh->prepare("select count(*) from $opt_table") or die $dbh->errstr;
  if (!$sth->execute)
  {
    $create = "CREATE TABLE $opt_table (msg_nro mediumint unsigned not null ";
    $create .= "auto_increment, date DATETIME NOT NULL, time_zone CHAR(6) ";
    $create .= "NOT NULL, mail_from char(120) not null, reply char(120), ";
    $create .= "mail_to TEXT, cc TEXT, sbj char(200), txt MEDIUMTEXT NOT ";
    $create .= "NULL, file char(32) noT NULL, hash INT NOT NULL, key ";
    $create .= "(msg_nro), primary key (mail_from, date, time_zone, hash))";
    $sth = $dbh->prepare($create) or die $dbh->errstr;
    $sth->execute() or die $dbh->errstr;
  }  
}

sub process_mail_file
{
  my ($dbh,$file_name)= @_;
  my (%values,$type,$check);

  %values=(); $type="";
  $check=0;

  while (<FILE>)
  {
    chop;
    if ($type ne "message")
    { 
      if (/^Reply-To: (.*)/i)  # finding different fields from file
      {
	$type="reply";
	$values{$type}= $1;
      }
      elsif (/^From: (.*)/i)
      {
	$type="from";
	$values{$type}= $1;
      }
      elsif (/^To: (.*)/i)
      {
	$type="to";
	$values{$type}= $1;
      }
      elsif (/^Cc: (.*)/i)
      {
	$type="cc";
	$values{$type}= $1;
      }
      elsif (/^Subject: (.*)/i)
      {
	$type="subject";
	$values{$type}= $1;
      }
      elsif (/^Date: (.*)/i)
      {
	date_parser($1,\%values);
	$type="rubbish";
      }
      elsif (/^[\w\W-]+:\s/)
      {
	$type="rubbish";  
      }
      elsif ($_ eq "")
      { 
	$type="message";
	$values{$type}="";
      }
      else
      {
	s/^\s*/ /;
	$values{$type}.= $_;
      }
    }
    elsif ($check!=0 && $_ ne "") # in case of forwarded messages
    {
      $values{$type}.= "\n" . $_;
      $check--;
    }
    elsif (/^From .* \d\d:\d\d:\d\d\s\d\d\d\d$/)
    {
      $values{'hash'}= checksum("$values{'message'}");
      update_table($dbh,$file_name,\%values);
      %values=(); $type="";
      $check=0;
    }
    elsif (/-* forwarded message .*-*/i) # in case of forwarded messages
    {
      $values{$type}.= "\n" . $_;
      $check++;
      $count_forwarded_msgs++;
    }
    else
    {
      $values{$type}.= "\n" . $_;
    }
  }
  $values{'hash'}= checksum("$values{'message'}");
  update_table($dbh,$file_name,\%values);
}

########

# converts date to the right form

sub date_parser
{
  my ($date_raw,$values)=@_;

  $date_raw =~ /\s*(\d{1,2}) (\w+) (\d{2,4}) (\d+:\d+:\d+)\s*([\w-+]{3-5})?/;

  $values->{'date'}=$3 . "-" . $months{$2} . "-" . "$1 $4";
  $values->{'time_zone'}=$5;
}

#########

# this is runned when the whole mail is gathered.
# this actually puts the mail to the database.

sub update_table
{
  my($dbh,$file_name,$values)=@_;
  my($query);

  if (! defined($values->{'subject'}) || !defined($values->{'to'}))
  {
    $no_subject++;
    return;			# Ignore these
  }
  $values->{'message'} =~ s/^\s*//; #removes whitespaces from the beginning 
  $values->{'message'} =~ s/\s*$//; #removes whitespaces from the end
  $query = "insert into $opt_table values (NULL,'" . $values->{'date'};
  $query .= "','" . $values->{'time_zone'} . "',";
  $query .= (defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL") . ",";
  $query .= (defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL") . ",";

  $query .= (defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL") . ","; 
  $query .= (defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL") . ","; 
  $query .= $dbh->quote($values->{'subject'}) . ",";
  $query .= $dbh->quote($values->{'message'}) . "," . $dbh->quote($file_name);
  $query .= ",'" . $values->{'hash'} . "')";
  
  if (length($values->{'message'}) > $opt_max_mail_size) #disables big message
  {
    $count_too_big++;
  }
  elsif ($values->{'from'} eq "") #disables mails with no from field
  {
    $count_no_from++;
  }
  elsif ($opt_test)
  {
    print "$query\n";
    $inserted_mails++;
  }
  elsif ($values->{'message'} eq "") #disables mails with no message text
  {
    $count_no_msg_text++;
  }
  elsif ($dbh->do($query))
  {
    $inserted_mails++;
  }
  elsif (!($dbh->errstr =~ /Duplicate entry /)) #disables duplicates
  {
    die "Aborting: Got error '" . $dbh->errstr ."' for query: '$query'\n";
  }
  else
  {
    $count_duplicates++;    
  }
  $query="";
}


##########

# In case you have two identical messages we wanted to identify them
# and remove additionals;  We do this by calculating a hash number of the
# message and ignoring messages with the same from, date and hash.
# This function calculates a simple 32 bit hash value for the message.

sub checksum
{
  my ($txt)= @_;
  my ($crc,$i,$count);
  $count = length($txt);
  for ($crc = $i = 0; $i < $count ; $i++)
  {
    $crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) +
      (($crc & (1 << 30)) ? 1 : 0);
    $crc &= ((1 << 31) -1);
  }
  return $crc;
}