Diffstat (limited to 'lib/DBD/SQLite/Cookbook.pod')
-rw-r--r--  lib/DBD/SQLite/Cookbook.pod  |  194
1 file changed, 194 insertions(+), 0 deletions(-)
diff --git a/lib/DBD/SQLite/Cookbook.pod b/lib/DBD/SQLite/Cookbook.pod
new file mode 100644
index 0000000..8b40c5b
--- /dev/null
+++ b/lib/DBD/SQLite/Cookbook.pod
@@ -0,0 +1,194 @@
+=head1 NAME
+
+DBD::SQLite::Cookbook - The DBD::SQLite Cookbook
+
+=head1 DESCRIPTION
+
+This is the L<DBD::SQLite> cookbook.
+
+It is intended to provide a place to keep a variety of functions and
+formulas for use with the callback APIs in L<DBD::SQLite>.
+
+=head1 AGGREGATE FUNCTIONS
+
+=head2 Variance
+
+This is a simple aggregate function which returns the variance of a
+column. It is adapted from an example implementation in pysqlite.
+
+ package variance;
+
+ sub new { bless [], shift; }
+
+ sub step {
+ my ( $self, $value ) = @_;
+
+ push @$self, $value;
+ }
+
+ sub finalize {
+ my $self = $_[0];
+
+ my $n = @$self;
+
+ # Variance is NULL unless there is more than one row
+ return undef unless $n > 1;
+
+ my $mu = 0;
+ foreach my $v ( @$self ) {
+ $mu += $v;
+ }
+ $mu /= $n;
+
+ my $sigma = 0;
+ foreach my $v ( @$self ) {
+ $sigma += ($v - $mu)**2;
+ }
+ $sigma = $sigma / ($n - 1);
+
+ return $sigma;
+ }
+
+ # NOTE: If you use an older DBI (< 1.608),
+ # use $dbh->func(..., "create_aggregate") instead.
+ $dbh->sqlite_create_aggregate( "variance", 1, 'variance' );
+
+The function can then be used as:
+
+ SELECT group_name, variance(score)
+ FROM results
+ GROUP BY group_name;
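+
+For completeness, here is a minimal sketch of how the aggregate might
+be driven from an ordinary DBI script. The database file name and the
+C<results> table are hypothetical; only C<sqlite_create_aggregate>
+itself is part of the DBD::SQLite API.
+
+ use DBI;
+
+ my $dbh = DBI->connect( "dbi:SQLite:dbname=results.db", "", "",
+ { RaiseError => 1, AutoCommit => 1 } );
+
+ # register the aggregate defined above (DBI >= 1.608)
+ $dbh->sqlite_create_aggregate( "variance", 1, 'variance' );
+
+ my $sth = $dbh->prepare(
+ "SELECT group_name, variance(score) FROM results GROUP BY group_name"
+ );
+ $sth->execute;
+ while ( my ( $group, $variance ) = $sth->fetchrow_array ) {
+ printf "%s: %s\n", $group, defined $variance ? $variance : 'NULL';
+ }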
+
+=head2 Variance (Memory Efficient)
+
+A more efficient variance function, optimized for memory usage at the
+expense of precision:
+
+ package variance2;
+
+ sub new { bless { sum => 0, count => 0, hash => {} }, shift; }
+
+ sub step {
+ my ( $self, $value ) = @_;
+ my $hash = $self->{hash};
+
+ # by truncating and hashing, we can consume many more data points
+ $value = int($value); # change depending on the need for precision
+ # use sprintf for arbitrary floating-point precision
+ $hash->{$value}++; # autovivifies to 1 on the first occurrence
+ $self->{sum} += $value;
+ $self->{count}++;
+ }
+
+ sub finalize {
+ my $self = $_[0];
+
+ # Variance is NULL unless there is more than one row
+ return undef unless $self->{count} > 1;
+
+ # calculate avg
+ my $mu = $self->{sum} / $self->{count};
+
+ my $sigma = 0;
+ while (my ($h, $v) = each %{$self->{hash}}) {
+ $sigma += (($h - $mu)**2) * $v;
+ }
+ $sigma = $sigma / ($self->{count} - 1);
+
+ return $sigma;
+ }
+
+The function can then be used as:
+
+ SELECT group_name, variance2(score)
+ FROM results
+ GROUP BY group_name;
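+
+The C<int($value)> truncation in C<step> is what bounds the hash size,
+at the cost of precision. As the comment there suggests, C<sprintf> can
+be used to bucket values to a fixed number of decimal places instead.
+This is a sketch of such an alternative C<step>, not part of the
+original recipe:
+
+ sub step {
+ my ( $self, $value ) = @_;
+
+ # bucket to two decimal places instead of whole integers;
+ # more distinct keys cost more memory, but keep more precision
+ $value = sprintf "%.2f", $value;
+
+ $self->{hash}{$value}++;
+ $self->{sum} += $value;
+ $self->{count}++;
+ }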
+
+=head2 Variance (Highly Scalable)
+
+A third variance implementation, designed for arbitrarily large data
+sets. It keeps only a running mean and a running sum of squared
+differences (Welford's online algorithm), so its memory use is constant
+no matter how many rows are aggregated:
+
+ package variance3;
+
+ sub new { bless {mu=>0, count=>0, S=>0}, shift; }
+
+ sub step {
+ my ( $self, $value ) = @_;
+ $self->{count}++;
+ my $delta = $value - $self->{mu};
+ $self->{mu} += $delta/$self->{count};
+ $self->{S} += $delta*($value - $self->{mu});
+ }
+
+ sub finalize {
+ my $self = $_[0];
+
+ # Variance is NULL unless there is more than one row
+ return undef unless $self->{count} > 1;
+
+ return $self->{S} / ($self->{count} - 1);
+ }
+
+The function can then be used as:
+
+ SELECT group_name, variance3(score)
+ FROM results
+ GROUP BY group_name;
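+
+To convince yourself that this single-pass update computes the same
+result as the two-pass formula used by the first recipe, here is a
+small standalone check (the sample values are arbitrary, and no
+database is involved):
+
+ my @data = ( 4, 7, 13, 16 );
+
+ # single-pass (Welford)
+ my $agg = variance3->new;
+ $agg->step($_) for @data;
+
+ # two-pass reference: mean first, then squared deviations
+ my $n = @data;
+ my $mu = 0;
+ $mu += $_ / $n for @data;
+ my $ref = 0;
+ $ref += ( $_ - $mu )**2 / ( $n - 1 ) for @data;
+
+ printf "welford=%g two-pass=%g\n", $agg->finalize, $ref; # both 30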
+
+=head1 FTS3 FULLTEXT INDEXING
+
+=head2 Sparing database disk space
+
+As explained in L<http://www.sqlite.org/fts3.html#section_6>, each
+FTS3 table C<I<t>> is stored internally within three regular tables
+C<I<t>_content>, C<I<t>_segments> and C<I<t>_segdir>. The last two
+tables contain the fulltext index. The first table C<I<t>_content>
+stores the complete documents being indexed; but if copies of the
+same documents are already stored somewhere else, or can be computed
+from external resources (for example as HTML or MS Word files in the
+filesystem), then this is quite a waste of space. SQLite itself only
+needs the C<I<t>_content> table for implementing the C<offsets()> and
+C<snippet()> functions, which are not always usable anyway (in
+particular with UTF-8 characters whose code points are above 255).
+
+So an alternative strategy is to use SQLite only for the fulltext
+index and metadata, and to keep the full documents outside of SQLite:
+to do so, after each insert or update in the FTS3 table, do an update
+in the C<I<t>_content> table, setting the content column(s) to
+NULL. Of course your application will need an algorithm for finding
+the external resource corresponding to any I<docid> stored within
+SQLite. Furthermore, the SQLite C<offsets()> and C<snippet()> functions
+cannot be used, so if such functionality is needed, it has to be
+programmed directly within the Perl application.
+In short, this strategy is really a hack, because FTS3 was not
+originally designed with that behaviour in mind; however it is workable
+and can dramatically reduce the size of the database file.
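+
+A minimal sketch of the insert-then-NULL dance, assuming a one-column
+FTS3 table named C<pages> whose single column is C<body> (FTS3 stores
+that column in C<pages_content> as C<c0body>); the table and column
+names are purely illustrative:
+
+ $dbh->do( "CREATE VIRTUAL TABLE pages USING fts3(body)" );
+
+ # index the document, then discard the stored copy;
+ # $text holds the document body, obtained from the external resource
+ $dbh->do( "INSERT INTO pages(body) VALUES(?)", undef, $text );
+ my $docid = $dbh->sqlite_last_insert_rowid;
+ $dbh->do( "UPDATE pages_content SET c0body = NULL WHERE docid = ?",
+ undef, $docid );
+
+ # the fulltext index still works; the application maps each docid
+ # back to its external resource
+ my $ids = $dbh->selectcol_arrayref(
+ "SELECT docid FROM pages WHERE pages MATCH ?", undef, 'sqlite' );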
+
+=head1 SUPPORT
+
+Bugs should be reported via the CPAN bug tracker at
+
+L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DBD-SQLite>
+
+=head1 TO DO
+
+* Add more and varied cookbook recipes, until we have enough to
+turn them into a separate CPAN distribution.
+
+* Create a series of test scripts that validate the cookbook recipes.
+
+=head1 AUTHOR
+
+Adam Kennedy E<lt>adamk@cpan.orgE<gt>
+
+Laurent Dami E<lt>dami@cpan.orgE<gt>
+
+=head1 COPYRIGHT
+
+Copyright 2009 - 2012 Adam Kennedy.
+
+This program is free software; you can redistribute
+it and/or modify it under the same terms as Perl itself.
+
+The full text of the license can be found in the
+LICENSE file included with this module.