author    Jacob Vosmaer <contact@jacobvosmaer.nl> 2015-09-18 16:33:38 +0200
committer Jacob Vosmaer <contact@jacobvosmaer.nl> 2015-09-18 16:57:33 +0200
commit    b9bbad5298e2d50bb424dad2f91a1ab75a36c687 (patch)
tree      f8671b2a21adb622d1f9e62f52e821c85b10d90b
parent    bf1e976d918bc343a0739e05c6f62c76f4035ba2 (diff)
download  gitlab-ci-b9bbad5298e2d50bb424dad2f91a1ab75a36c687.tar.gz
Integrate mysql-to-postgres conversion
-rw-r--r--  lib/backup/database.rb                                        43
-rw-r--r--  lib/support/mysql-postgresql-converter/README.md               9
-rw-r--r--  lib/support/mysql-postgresql-converter/db_converter.py        13
-rwxr-xr-x  lib/support/mysql-postgresql-converter/splice_drop_indexes    37
-rw-r--r--  lib/tasks/backup.rake                                          3
5 files changed, 89 insertions, 16 deletions
diff --git a/lib/backup/database.rb b/lib/backup/database.rb
index e63dcb8..3d09c8c 100644
--- a/lib/backup/database.rb
+++ b/lib/backup/database.rb
@@ -1,4 +1,5 @@
require 'yaml'
+require 'open3'
module Backup
class Database
@@ -17,7 +18,7 @@ module Backup
FileUtils.mkdir_p(@db_dir) unless Dir.exists?(@db_dir)
end
- def dump
+ def dump(mysql_to_postgresql=false)
FileUtils.rm_f(db_file_name)
compress_rd, compress_wr = IO.pipe
compress_pid = spawn(*%W(gzip -c), in: compress_rd, out: [db_file_name, 'w', 0600])
@@ -26,7 +27,9 @@ module Backup
dump_pid = case config["adapter"]
when /^mysql/ then
$progress.print "Dumping MySQL database #{config['database']} ... "
- spawn('mysqldump', *mysql_args, config['database'], *TABLES, out: compress_wr)
+ args = mysql_args
+ args << '--compatible=postgresql' if mysql_to_postgresql
+ spawn('mysqldump', *args, config['database'], *TABLES, out: compress_wr)
when "postgresql" then
$progress.print "Dumping PostgreSQL database #{config['database']} ... "
pg_env
@@ -38,6 +41,42 @@ module Backup
report_success(success)
abort 'Backup failed' unless success
+ convert_to_postgresql if mysql_to_postgresql
+ end
+
+ def convert_to_postgresql
+ mysql_dump_gz = db_file_name + '.mysql'
+ psql_dump_gz = db_file_name + '.psql'
+ drop_indexes_sql = File.join(db_dir, 'drop_indexes.sql')
+
+ File.rename(db_file_name, mysql_dump_gz)
+
+ $progress.print "Converting MySQL database dump to Postgres ... "
+ statuses = Open3.pipeline(
+ %W(gzip -cd #{mysql_dump_gz}),
+ %W(python lib/support/mysql-postgresql-converter/db_converter.py - - #{drop_indexes_sql}),
+ %W(gzip -c),
+ out: [psql_dump_gz, 'w', 0600]
+ )
+
+ if !statuses.compact.all?(&:success?)
+ abort "mysql-to-postgresql-converter failed"
+ end
+ $progress.puts '[DONE]'.green
+
+ $progress.print "Splicing in 'DROP INDEX' statements ... "
+ statuses = Open3.pipeline(
+ %W(lib/support/mysql-postgresql-converter/splice_drop_indexes #{psql_dump_gz} #{drop_indexes_sql}),
+ %W(gzip -c),
+ out: [db_file_name, 'w', 0600]
+ )
+ if !statuses.compact.all?(&:success?)
+ abort "Failed to splice in 'DROP INDEX' statements"
+ end
+
+ $progress.puts '[DONE]'.green
+ ensure
+ FileUtils.rm_f([mysql_dump_gz, psql_dump_gz, drop_indexes_sql])
end
def restore
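The Open3.pipeline calls above work like shell pipelines: each %W() word list
becomes one child process, stdout of each feeds stdin of the next, the out:
option redirects the last stdout to a file created with mode 0600, and the
return value is one Process::Status per command. The first pipeline
corresponds roughly to this shell sketch (the file names are placeholders
standing in for db_file_name and the derived temp files):

    gzip -cd database.sql.gz.mysql \
      | python lib/support/mysql-postgresql-converter/db_converter.py - - drop_indexes.sql \
      | gzip -c > database.sql.gz.psql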
diff --git a/lib/support/mysql-postgresql-converter/README.md b/lib/support/mysql-postgresql-converter/README.md
index 9940ee5..f0e97cd 100644
--- a/lib/support/mysql-postgresql-converter/README.md
+++ b/lib/support/mysql-postgresql-converter/README.md
@@ -35,17 +35,14 @@ First, dump your MySQL database in PostgreSQL-compatible format
Then, convert it using the db_converter.py script.
- python db_converter.py databasename.mysql databasename.psql
+ python db_converter.py databasename.mysql - drop_indexes.sql | gzip -c > databasename.unfinished.psql.gz
It'll print progress to the terminal.
Now we have a DB dump that can be imported but the import will be slow due
-to existing indexes. We use 'ed' to edit the DB dump file and move the
-'DROP INDEX' statements to the start of the import. Ed is not the fastest
-tool for this job if your DB dump is multiple gigabytes. (Patches to
-the converter are welcome!)
+to existing indexes.
- ed -s databasename.psql < move_drop_indexes.ed
+ ./splice_drop_indexes databasename.unfinished.psql.gz drop_indexes.sql > databasename.psql
Next, load your new dump into a fresh PostgreSQL database using:
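The hunk ends just before the load command itself. A hedged sketch of that
step, assuming a plain psql import with the database name as a placeholder:

    psql -f databasename.psql databasename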
diff --git a/lib/support/mysql-postgresql-converter/db_converter.py b/lib/support/mysql-postgresql-converter/db_converter.py
index 8a7f8a2..38a0572 100644
--- a/lib/support/mysql-postgresql-converter/db_converter.py
+++ b/lib/support/mysql-postgresql-converter/db_converter.py
@@ -15,7 +15,7 @@ import time
import subprocess
-def parse(input_filename, output_filename):
+def parse(input_filename, output_filename, drop_index_filename):
"Feed it a file, and it'll output a fixed one"
# State storage
@@ -44,6 +44,8 @@ def parse(input_filename, output_filename):
output = open(output_filename, "w")
logging = sys.stdout
+ drop_index = open(drop_index_filename, "w")
+
if input_filename == "-":
input_fh = sys.stdin
else:
@@ -234,12 +236,9 @@ def parse(input_filename, output_filename):
for line in sequence_lines:
output.write("%s;\n" % line)
- # This line is an anchor for move_drop_indexes.ed
- output.write("\n-- Drop indexes --\n")
+ drop_index.write("-- Drop indexes --\n")
for line in drop_index_lines:
- output.write("%s;\n" % line)
- # This line is an anchor for move_drop_indexes.ed
- output.write("-- END Drop indexes --\n")
+ drop_index.write("%s;\n" % line)
# Write indexes out
output.write("\n-- Indexes --\n")
@@ -253,4 +252,4 @@ def parse(input_filename, output_filename):
if __name__ == "__main__":
- parse(sys.argv[1], sys.argv[2])
+ parse(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/lib/support/mysql-postgresql-converter/splice_drop_indexes b/lib/support/mysql-postgresql-converter/splice_drop_indexes
new file mode 100755
index 0000000..a2b5de8
--- /dev/null
+++ b/lib/support/mysql-postgresql-converter/splice_drop_indexes
@@ -0,0 +1,37 @@
+#!/bin/sh
+# This script reorders database dumps generated by db_converter.py for
+# efficient consumption by Postgres.
+
+fail() {
+ echo "$@" 1>&2
+ exit 1
+}
+
+db_gz=$1
+drop_indexes_sql=$2
+
+if [ -z "$db_gz" ] || [ -z "$drop_indexes_sql" ] ; then
+ fail "Usage: $0 database.sql.gz drop_indexes.sql"
+fi
+
+# Capture all text up to and including the first occurrence of 'SET CONSTRAINTS'
+preamble=$(zcat "$db_gz" | sed '/SET CONSTRAINTS/q')
+if [ -z "$preamble" ] ; then
+ fail "Could not read preamble"
+fi
+
+drop_indexes=$(cat "$drop_indexes_sql")
+if [ -z "$drop_indexes" ] ; then
+ fail "Could not read DROP INDEXES file"
+fi
+
+# Print preamble and drop indexes
+cat <<EOF
+${preamble}
+
+${drop_indexes}
+EOF
+
+# Print the rest of database.sql.gz. The bare awk pattern 'f' prints a line
+# only once the flag is set, and the flag is set after the first line
+# matching 'SET CONSTRAINTS', so this emits everything after that match.
+zcat "$db_gz" | awk 'f; /SET CONSTRAINTS/ { f = 1 }'
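The awk one-liner at the end is the usual "print everything after the first
match" idiom: the bare pattern f prints a line only when the flag is already
set, and the block sets the flag once 'SET CONSTRAINTS' has been seen, so the
matching line itself is not reprinted. A quick illustration:

    printf 'a\nSET CONSTRAINTS;\nb\nc\n' | awk 'f; /SET CONSTRAINTS/ { f = 1 }'
    # prints:
    # b
    # c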
diff --git a/lib/tasks/backup.rake b/lib/tasks/backup.rake
index 4eaa311..67c2dc4 100644
--- a/lib/tasks/backup.rake
+++ b/lib/tasks/backup.rake
@@ -3,13 +3,14 @@ namespace :backup do
desc "GITLAB | Create a backup of the GitLab CI database"
task create: :environment do
configure_cron_mode
+ mysql_to_postgresql = (ENV['MYSQL_TO_POSTGRESQL'] == '1')
$progress.puts "Applying final database migrations ... ".blue
Rake::Task['db:migrate'].invoke
$progress.puts "done".green
$progress.puts "Dumping database ... ".blue
- Backup::Database.new.dump
+ Backup::Database.new.dump(mysql_to_postgresql)
$progress.puts "done".green
$progress.puts "Dumping builds ... ".blue
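With this change, a converting backup is opted into via the environment
variable checked above. A hedged usage sketch, assuming the standard
bundler-based rake invocation for GitLab CI:

    MYSQL_TO_POSTGRESQL=1 bundle exec rake backup:create RAILS_ENV=production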