#!/usr/local/bin/perl
# mysqldumpslow - parse and summarize the MySQL slow query log

# Original version by Tim Bunce, sometime in 2000.
# Further changes by Tim Bunce, 8th March 2001.
# Handling of strings with \ and double '' by Monty 11 Aug 2001.
# Show database name by Thierry Randrianiriana 31st Jul 2006.
# inverse grep (G), rows_examined processing, IN and
#  VALUES abstraction by Nils Goroll, 2006-2007
#  see http://bugs.mysql.com/bug.php?id=7414

use strict;
use Getopt::Long;

# t=time, l=lock time, r=rows
# at, al, and ar are the corresponding averages

my %opt = (
    s => 'at',
    h => '*',
);

GetOptions(\%opt,
    'verbose|v+',	# verbose
    'help+',		# write usage info
    'debug|d+',		# debug
    's=s',		# what to sort by (t, at, l, al, r, ar etc)
    'r!',		# reverse the sort order (largest last instead of first)
    't=i',		# just show the top n queries
    'a!',		# don't abstract all numbers to N and strings to 'S'
    'ain!',		# don't abstract IN (...)
    'avalues!',		# don't abstract VALUES (...)
    'awhitespace!',	# don't normalise arbitrary whitespace
    'n=i',		# abstract numbers with at least n digits within names
    'g=s',		# grep: only consider stmts that include this string
    'G=s',		# inverse grep: only consider stmts that DO NOT include this string
    'h=s',		# hostname of db server for *-slow.log filename (can be wildcard)
    'i=s',		# name of server instance (if using mysql.server startup script)
    'l!',		# don't subtract lock time from total time
) or usage("bad option");

$opt{'help'} and usage();

unless (@ARGV) {
    my $defaults   = `my_print_defaults mysqld`;
    my $basedir = ($defaults =~ m/--basedir=(.*)/)[0]
	or die "Can't determine basedir from 'my_print_defaults mysqld' output: $defaults";
    warn "basedir=$basedir\n" if $opt{verbose};

    my $datadir = ($defaults =~ m/--datadir=(.*)/)[0];
    my $slowlog = ($defaults =~ m/--log-slow-queries=(.*)/)[0];
    if (!$datadir or $opt{i}) {
	# determine the datadir from the instances section of /etc/my.cnf, if any
	my $instances  = `my_print_defaults instances`;
	die "Can't determine datadir from 'my_print_defaults mysqld' output: $defaults"
	    unless $instances;
	my @instances = ($instances =~ m/^--(\w+)-/mg);
	die "No -i 'instance_name' specified to select among known instances: @instances.\n"
	    unless $opt{i};
	die "Instance '$opt{i}' is unknown (known instances: @instances)\n"
	    unless grep { $_ eq $opt{i} } @instances;
	$datadir = ($instances =~ m/--$opt{i}-datadir=(.*)/)[0]
	    or die "Can't determine --$opt{i}-datadir from 'my_print_defaults instances' output: $instances";
	warn "datadir=$datadir\n" if $opt{verbose};
    }

    if ( -f $slowlog ) {
        @ARGV = ($slowlog);
        die "Can't find '$slowlog'\n" unless @ARGV;
    } else {
        @ARGV = <$datadir/$opt{h}-slow.log>;
        die "Can't find '$datadir/$opt{h}-slow.log'\n" unless @ARGV;
    }
}

warn "\nReading mysql slow query log from @ARGV\n";

my @pending;
my %stmt;
my $db;
$/ = ";\n#";		# read entire statements using paragraph mode

while ( defined($_ = shift @pending) or defined($_ = <>) ) {
    warn "[[$_]]\n" if $opt{debug};	# show raw paragraph being read

    ## Remove the MySQL startup message and process anything which
    ## is not the message itself by prepending it to @pending
    my @chunks = split /\S+\s+Version.*started\s+with.*\n.*\nTime\s+Id\s+Command\s+Argument\s*\n/m;
    if (@chunks > 1) {
	unshift @pending, map { length($_) ? $_ : () } @chunks;
	warn "<<".join(">>\n<<",@chunks).">>" if $opt{debug};
	next;
    }

    if(/(?:use|connect)\s+(\w+)/i) {
	$db=$1;
    }

    s/^#? Time: \d{6}\s+\d+:\d+:\d+.*\n//;
    my ($user,$host) = s/^#? User\@Host:\s+(\S+)\s+\@\s+(\S+).*\n// ? ($1,$2) : ('','');

    # Query_time: 3  Lock_time: 0  Rows_sent: 3  Rows_examined: 532233
    s/^# Query_time: (\d+)  Lock_time: (\d+)  Rows_sent: (\d+)  Rows_examined: (\d+).*\n//;
    my ($t, $l, $r, $re) = ($1, $2, $3, $4);
    $t -= $l unless $opt{l};

    ## slink@mcs.de 20071022: this should never match (see @chunks split above)
    ## # remove fluff that mysqld writes to log when it (re)starts:
    ## s!^/.*Version.*started with:.*\n!!mg;
    ## s!^Tcp port:\s+\d+\s+Unix socket:\s+\S+\n!!mg;
    ## s!^Time.*Id.*Command.*Argument.*\n!!mg;

    s/^(?:use|connect)\s+\w+;\n//;	# not consistently added
    s/^SET timestamp=\d+;\n//;

    s/^[ \t]*\n//mg;		# delete blank lines
    s/^[ \t]*/  /mg;		# normalize leading whitespace
    s/\s*;\s*(#\s*)?$//;	# remove trailing semicolon(+newline-hash)

    next if $opt{g} and !m/$opt{g}/io;
    next if $opt{G} and m/$opt{G}/io;

    # abstract strings - do this before IN (...) and VALUES (...)
    # to get correct results for values containg quotes
    unless ($opt{a}) {
        s/''/'S'/g;
        s/""/"S"/g;
        s/(\\')//g;
        s/(\\")//g;
        s/'[^']+'/'S'/g;
        s/"[^"]+"/"S"/g;
    }

    unless ($opt{ain}) {
	s/IN\s*\([^\)]*\)/IN (...)/gi;
    }

    unless ($opt{avalues}) {
	# this RE is not optimal (too greedy - eats everything after VALUES (),()... which ends with )
	# but the correct version of it triggers a perl bug (MCS: see CTS#58124)
	## correct code triggering bug: s/VALUES([,\s]*\([^\)]+\)){1,}/VALUES (...)/gi;
	s/VALUES\s*\(.+\)/VALUES (...)/gi;
    }

    unless ($opt{awhitespace}) {
	## slink@mcs.de: we can also abstract any duplicate whitespace - there is no
	## string we could destroy because strings are already reduced to S
	## in the code above - but we need to re-normalise leading whitespace again
	s/\s+/ /g;
        s/^[ \t]*/  /;
    }

    unless ($opt{a}) {
	s/\b\d+\b/N/g;
	s/\b0x[0-9A-Fa-f]+\b/N/g;
	# -n=8: turn log_20001231 into log_NNNNNNNN
	s/([a-z_]+)(\d{$opt{n},})/$1.('N' x length($2))/ieg if $opt{n};
	# abbreviate massive "in (...)" statements and similar
	s!(([NS],){100,})!sprintf("$2,{repeated %d times}",length($1)/2)!eg;
    }

    my $s = $stmt{$_} ||= { users=>{}, hosts=>{} };
    $s->{c} += 1;
    $s->{t} += $t;
    $s->{l} += $l;
    $s->{r} += $r;
    $s->{re} += $re;
    $s->{users}->{$user}++ if $user;
    $s->{hosts}->{$host}++ if $host;
    $s->{dbs}->{$db}++ if $db;

    warn "{{$_}}\n\n" if $opt{debug};	# show processed statement string
}

foreach (keys %stmt) {
    my $v = $stmt{$_} || die;
    my ($c, $t, $l, $r, $re) = @{ $v }{qw(c t l r re)};
    $v->{at} = $t / $c;
    $v->{al} = $l / $c;
    $v->{ar} = $r / $c;
    $v->{are} = $re / $c;
}

my @sorted = sort { $stmt{$b}->{$opt{s}} <=> $stmt{$a}->{$opt{s}} } keys %stmt;
@sorted = @sorted[0 .. $opt{t}-1] if $opt{t};
@sorted = reverse @sorted         if $opt{r};

foreach (@sorted) {
    my $v = $stmt{$_} || die;
    my ($c, $t,$at, $l,$al, $r,$ar,$re,$are) = @{ $v }{qw(c t at l al r ar re are)};
    my @users = keys %{$v->{users}};
    my $user  = (@users==1) ? $users[0] : sprintf "%dusers",scalar @users;
    my @hosts = keys %{$v->{hosts}};
    my $host  = (@hosts==1) ? $hosts[0] : sprintf "%dhosts",scalar @hosts;
    my @dbs = keys %{$v->{dbs}};
    my $db = (@dbs==1) ? $dbs[0] : sprintf "%ddbs",scalar @dbs;
    printf "Count: %d  Time=%.2fs (%ds)  Lock=%.2fs (%ds)  Rows=%.1f (%d)  Rows examined=%.1f (%d), Database: %s $user\@$host\n%s\n\n",
	    $c, $at,$t, $al,$l, $ar,$r, $are, $re, $db, $_;
}

sub usage {
    my $str= shift;
    my $text= <<HERE;
Usage: mysqldumpslow [ OPTS... ] [ LOGS... ]

Parse and summarize the MySQL slow query log. Options are

  --verbose    verbose
  --debug      debug
  --help       write this text to standard output

  -v           verbose
  -d           debug
  -s ORDER     what to sort by, 'at' is default

               absolute values:
               c  : Query count
               t  : Query time
               l  : Lock time
               r  : Rows sent
               re : Rows examined
               
               for average values (average per query), prepend an a (except for c)
               at, al, ar, are

  -r           reverse the sort order (largest last instead of first)
  -t NUM       just show the top n queries
  -a           don't abstract all numbers to N and strings to 'S'
  -ain         don't abstract IN (something) to 'IN (...)'
  -avalues     don't abstract VALUES (Something) to 'VALUES (...)'
  -awhitespace don't normalise arbitrary whitespace (preserve original formatting)
               note: leading whitespace is normalised always
  -n NUM       abstract numbers with at least n digits within names
  -g PATTERN   grep: only consider stmts that include this string
  -G PATTERN   inverse grep: only consider stmts that do not include this string
  -h HOSTNAME  hostname of db server for *-slow.log filename (can be wildcard),
               default is '*', i.e. match all
  -i NAME      name of server instance (if using mysql.server startup script)
  -l           don't subtract lock time from total time

HERE
    if ($str) {
      print STDERR "ERROR: $str\n\n";
      print STDERR $text;
      exit 1;
    } else {
      print $text;
      exit 0;
    }
}
