#!/usr/bin/perl use DBI; use Getopt::Long; use Data::Dumper; use File::Spec; $opt_ot_rows=100000; $opt_fanout=5; $opt_database="test"; $opt_host="localhost"; $opt_socket=""; $opt_port=""; $opt_datadir=""; $opt_user='root'; $opt_password=''; %data_filename=( ot =>"outer.txt", it=>"inner.txt" ); $opt_generate=$opt_load_data=""; GetOptions("ot-rows=s","fanout=s", "database=s", "host=s","socket=s","port=s", "user=s","password=s", "datadir=s", "generate","load-data") || usage(); usage() unless ($opt_generate || $opt_load_data); #Tables characteristics $outer_rows=$opt_ot_rows; $inner_rows=$outer_rows*$opt_fanout; $outer_range_min=0; $outer_range_max=1000000000; $inner_range_min=0; $inner_range_max=1000000000; $outer_table=( id => {min=>0, max=>$outer_rows, uniq=>$outer_rows, type=>ordinal }, expr_key=> { min=>0, max=>$outer_range_max, uniq=>$outer_rows, type=>uniform }, expr_nokey=> { min=>0, max=>$outer_range_max, uniq=>$outer_rows, type=>uniform }, expr_multi=> { min=>0, max=>1000, uniq=>$outer_rows/1000, type=>uniform }); $inner_table=( id => {min=>0, max=>$inner_rows, uniq=>$inner_rows, type=>ordinal }, expr_key=> { min=>0, max=>$inner_range_max, uniq=>$outer_rows, type=>uniform }, expr_nokey=> { min=>0, max=>$inner_range_max, uniq=>$outer_rows, type=>uniform }); if ($opt_datadir && ! -d "$opt_datadir") { die <rel2abs($opt_datadir); if ($opt_generate) { print<$filename") || die "Can't open file $filename"; $outer_id=new my_ordinal($outer_rows, 1, $outer_rows); $outer_key=new my_uniform($outer_rows, 0, 1000000000, $outer_rows); $outer_multi=new my_uniform($outer_rows, 0, 1000, 1000); $outer_padder=new my_alphanum($outer_rows, 4, 100); for ($i=0;$i<$outer_rows;$i++) { $outer_expr_key=$outer_expr_nokey=$outer_key->get_next_value(); print OUTER join(",",$outer_id->get_next_value(), $outer_expr_key, $outer_expr_nokey, $outer_multi->get_next_value(),$outer_padder->get_next_value()),"\n"; } close(OUTER); } sub generate_inner_table() { my ($filename)=@_; open(INNER,">$filename") || die "Can't open file $filename" ; $inner_id=new my_ordinal($inner_rows, 1, $inner_rows); $inner_key=new my_uniform($inner_rows, 0, 1000000000, $outer_rows); $inner_padder=new my_alphanum($inner_rows, 4, 100); for ($i=0;$i<$inner_rows;$i++) { $inner_expr_key=$inner_expr_nokey=$inner_key->get_next_value(); print INNER join(",",$inner_id->get_next_value(), $inner_expr_key, $inner_expr_nokey,$inner_padder->get_next_value()),"\n"; } close(INNER); } sub load_data() { %create_table_stmt=( ot => "create table ot (id int not null, expr_key int not null, expr_nokey int not null, expr_multi int not null,expr_padder char(10), primary key (id), key (expr_key))", it => "create table it (id int not null, expr_key int not null, expr_nokey int not null,expr_padder char(10), primary key (id), key (expr_key))"); $drop_table_stmt="drop table if exists %s"; $load_data_stmt="load data infile '%s' into table %s FIELDS TERMINATED BY ','"; $dsn="dbi:mysql:$opt_database;host=$opt_host"; $dsn.= ";mysql_socket=$opt_socket" if ($opt_socket); $dbh=DBI->connect($dsn, $opt_user, $opt_password) || die "Can't connect to MySQL server"; foreach $table (ot,it) { do_query($dbh,sprintf($drop_table_stmt,$table)); do_query($dbh,$create_table_stmt{$table}); $filename=$opt_datadir."/".$data_filename{$table}; if ( -f $filename ) { do_query($dbh,sprintf($load_data_stmt,$filename,$table)); } else { print "ERROR: Can't read file: $filename to load data for table: $table\n"; } } } sub do_query { my($dbh, $query)=@_; print "$query\n" if ($opt_debug); $dbh->do($query) or die "\nError executing '$query':\n$DBI::errstr\n"; } sub usage() { print < Directory where script will store result files or look for dataset files to load --generate Generate dataset that contsits of two tables 'outer' and 'inner' and store generated data in plan text files with the same names --ot-rows= Number of rows in outer(ot) table(Default:100000) --fanout= Factor that indicate how many record from the outer table has in the inner table(Default: 5) --load-data Load generated dataset with --generate option --database= --host= --user= --password= --socket= EOF } package my_ordinal; sub new { my ($class,$nrows,$min,$max)=@_; my $self={}; bless $self,$class; $self->{'nrows'}= $nrows; $self->{'min'}= $min; $self->{'max'}= $max; $self->{'value'}= 0; $self->{'resvalue'}= ''; $self->{'range'}= $max-$min; $self->{'step'}= 1; #range sparse? if ($self->{'nrows'} < $self->{'range'}) { $self->{'step'}=$self->{'range'}/($self->{'nrows'}-1); } return $self; } sub get_next_value { my ($self)=@_; $self->{'resvalue'}= sprintf("%.0f", $self->{'min'} + $self->{'value'}); $self->{'value'}+= $self->{'step'}; if ($self->{'value'} > $self->{'range'}) { $self->{'value'}=0; } return $self->{'resvalue'}; } package my_uniform; # Generator below is able to genarate sequence of uniformly # distributed values in range [0..10^15] # # It is possible to define range and number of uniques values # that will be generated in this range sub new { my ($class,$nrows,$min,$max,$uniq)=@_; my $self={}; bless $self,$class; @root=(2, 7, 26, 59, 242, 568, 1792, 5649, 16807, 30001, 60010, 180001, 360002, 1000001, 2000000); @prime=(11, 101, 1009, 10007, 100003, 1000003, 10000019, 100000007, 2147483647, 10000000019, 100000000003, 1000000000039, 10000000000037, 100000000000031, 1000000000000037); $self->{'nrows'}= $nrows; $self->{'min'}= $min; $self->{'max'}= $max; $self->{'uniq'}= $uniq; $self->{'range'}= $max-$min; $self->{'index'}=(log($self->{'range'})/log(10)); $self->{'R'}= $root[$self->{'index'}]; $self->{'P'}= $prime[$self->{'index'}]; $self->{'seed'}= $self->{'R'}; $self->{'uniq_count'}=0; return $self; } sub get_next_value() { my ($self)=@_; if ($self->{'uniq_count'} >= $self->{'uniq'}) { $self->{'R'}= $root[$self->{'index'}]; $self->{'P'}= $prime[$self->{'index'}]; $self->{'seed'}= $self->{'R'}; $self->{'uniq_count'}=0; } $self->{'resvalue'}=$self->{'min'}+$self->get_uniform(); $self->{'uniq_count'}++; return $self->{'resvalue'}; } sub get_uniform() { my ($self)=@_; $self->{'seed'} = ($self->{'R'} * $self->{'seed'}) % $self->{'P'}; while ( $self->{'seed'} > $self->{'range'}) { $self->{'seed'}= ($self->{'R'}* $self->{'seed'}) % $self->{'P'}; } return ($self->{'seed'} - 1); } package my_alphanum; sub new { my ($class,$nrows,$len,$uniques)=@_; my $self={}; bless $self,$class; $self->{'nrows'}= $nrows; $self->{'len'}= $len; $self->{'uniques'}= $uniques; $self->{'charset'}= ['.', '/',' ',0..9, 'A'..'Z', 'a'..'z']; $self->{'charset_size'}= $#{$self->{'charset'}}; #todo: add uniq part $self->{'count'}=0; return $self; } sub get_next_value() { my ($self)=@_; $self->{'string'}=""; if ($self->{'count'} % $self->{'uniques'}) { for ($i=0;$i<$self->{'len'};$i++) { $self->{'string'}.=@{$self->{'charset'}}[rand $self->{'charset_size'}]; } } else { $self->{'string'}="@{$self->{'charset'}}[rand $self->{'charset_size'}]"x$self->{'len'}; } $self->{'count'}++; return $self->{'string'}; }