From 485b97be9f2f2abf5a40923b5fd85f75714a8c02 Mon Sep 17 00:00:00 2001 From: Lorry Tar Creator Date: Mon, 24 Sep 2012 10:15:50 +0000 Subject: Imported from /srv/lorry/lorry-area/perl-dbd-sqlite-tarball/DBD-SQLite-1.38_01.tar.gz. --- lib/DBD/SQLite/Cookbook.pod | 194 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 lib/DBD/SQLite/Cookbook.pod (limited to 'lib/DBD/SQLite') diff --git a/lib/DBD/SQLite/Cookbook.pod b/lib/DBD/SQLite/Cookbook.pod new file mode 100644 index 0000000..8b40c5b --- /dev/null +++ b/lib/DBD/SQLite/Cookbook.pod @@ -0,0 +1,194 @@ +=head1 NAME + +DBD::SQLite::Cookbook - The DBD::SQLite Cookbook + +=head1 DESCRIPTION + +This is the L<DBD::SQLite> cookbook. + +It is intended to provide a place to keep a variety of functions and +formulas for use in callback APIs in L<DBD::SQLite>. + +=head1 AGGREGATE FUNCTIONS + +=head2 Variance + +This is a simple aggregate function which returns a variance. It is +adapted from an example implementation in pysqlite. + + package variance; + + sub new { bless [], shift; } + + sub step { + my ( $self, $value ) = @_; + + push @$self, $value; + } + + sub finalize { + my $self = $_[0]; + + my $n = @$self; + + # Variance is NULL unless there is more than one row + return undef unless $n > 1; + + my $mu = 0; + foreach my $v ( @$self ) { + $mu += $v; + } + $mu /= $n; + + my $sigma = 0; + foreach my $v ( @$self ) { + $sigma += ($v - $mu)**2; + } + $sigma = $sigma / ($n - 1); + + return $sigma; + } + + # NOTE: If you use an older DBI (< 1.608), + # use $dbh->func(..., "create_aggregate") instead. 
+ $dbh->sqlite_create_aggregate( "variance", 1, 'variance' ); + +The function can then be used as: + + SELECT group_name, variance(score) + FROM results + GROUP BY group_name; + +=head2 Variance (Memory Efficient) + +A more efficient variance function, optimized for memory usage at the +expense of precision: + + package variance2; + + sub new { bless {sum => 0, count=>0, hash=> {} }, shift; } + + sub step { + my ( $self, $value ) = @_; + my $hash = $self->{hash}; + + # by truncating and hashing, we can consume many more data points + $value = int($value); # change depending on need for precision + # use sprintf for arbitrary fp precision + if (exists $hash->{$value}) { + $hash->{$value}++; + } else { + $hash->{$value} = 1; + } + $self->{sum} += $value; + $self->{count}++; + } + + sub finalize { + my $self = $_[0]; + + # Variance is NULL unless there is more than one row + return undef unless $self->{count} > 1; + + # calculate avg + my $mu = $self->{sum} / $self->{count}; + + my $sigma = 0; + while (my ($h, $v) = each %{$self->{hash}}) { + $sigma += (($h - $mu)**2) * $v; + } + $sigma = $sigma / ($self->{count} - 1); + + return $sigma; + } + +The function can then be used as: + + SELECT group_name, variance2(score) + FROM results + GROUP BY group_name; + +=head2 Variance (Highly Scalable) + +A third variance implementation, designed for arbitrarily large data sets: + + package variance3; + + sub new { bless {mu=>0, count=>0, S=>0}, shift; } + + sub step { + my ( $self, $value ) = @_; + $self->{count}++; + my $delta = $value - $self->{mu}; + $self->{mu} += $delta/$self->{count}; + $self->{S} += $delta*($value - $self->{mu}); + } + + sub finalize { + my $self = $_[0]; + return $self->{S} / ($self->{count} - 1); + } + +The function can then be used as: + + SELECT group_name, variance3(score) + FROM results + GROUP BY group_name; + +=head1 FTS3 fulltext indexing + +=head2 Sparing database disk space + +As explained in L<http://www.sqlite.org/fts3.html>, each +FTS3 table C<I<t>> is stored internally within
three regular tables +C<I<t>_content>, C<I<t>_segments> and C<I<t>_segdir>. The last two +tables contain the fulltext index. The first table C<I<t>_content> +stores the complete documents being indexed ... but if copies of the +same documents are already stored somewhere else, or can be computed +from external resources (for example as HTML or MsWord files in the +filesystem), then this is quite a waste of space. SQLite itself only +needs the C<I<t>_content> table for implementing the C<snippet> and +C<offsets> functions, which are not always usable anyway (in particular +when using utf8 characters greater than 255). + +So an alternative strategy is to use SQLite only for the fulltext +index and metadata, and to keep the full documents outside of SQLite : +to do so, after each insert or update in the FTS3 table, do an update +in the C<I<t>_content> table, setting the content column(s) to +NULL. Of course your application will need an algorithm for finding +the external resource corresponding to any I<docid> stored within +SQLite. Furthermore, SQLite C<snippet> and C<offsets> functions +cannot be used, so if such functionality is needed, it has to be +directly programmed within the Perl application. +In short, this strategy is really a hack, because FTS3 was not originally +programmed with that behaviour in mind; however it is workable +and has a strong impact on the size of the database file. + +=head1 SUPPORT + +Bugs should be reported via the CPAN bug tracker at + +L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DBD-SQLite> + +=head1 TO DO + +* Add more and varied cookbook recipes, until we have enough to +turn them into a separate CPAN distribution. + +* Create a series of test scripts that validate the cookbook recipes. + +=head1 AUTHOR + +Adam Kennedy E<lt>adamk@cpan.orgE<gt> + +Laurent Dami E<lt>dami@cpan.orgE<gt> + +=head1 COPYRIGHT + +Copyright 2009 - 2012 Adam Kennedy. + +This program is free software; you can redistribute +it and/or modify it under the same terms as Perl itself. + +The full text of the license can be found in the +LICENSE file included with this module. -- cgit v1.2.1