#!/usr/bin/perl use DBI; use Getopt::Long; use Data::Dumper; use File::Spec; $opt_ot_rows=100000; $opt_ot_2_it=5; $opt_database="test"; $opt_host="localhost"; $opt_socket=""; $opt_port=""; $opt_datadir=""; $opt_user='root'; $opt_password=''; %data_filename=( ot =>"outer.txt", it=>"inner.txt" ); $opt_generate=$opt_load_data=""; GetOptions("ot-rows=s","ot-2-it=s", "database=s", "host=s","socket=s","port=s", "user=s","password=s", "datadir=s", "generate","load-data") || usage(); usage() unless ($opt_generate || $opt_load_data); $outer_rows=$opt_ot_rows; $inner_rows=$outer_rows*$opt_ot_2_it; if ($opt_datadir && ! -d "$opt_datadir") { die <rel2abs($opt_datadir); if ($opt_generate) { print<$filename") || die "Can't open file $filename"; $outer_id=new my_ordinal($outer_rows, 1, $outer_rows); $outer_key=new my_uniform($outer_rows, 0, 1000000000, $outer_rows); $outer_multi=new my_uniform($outer_rows, 0, 1000, 1000); for ($i=0;$i<$outer_rows;$i++) { $outer_expr_key=$outer_expr_nokey=$outer_key->get_next_value(); print OUTER join(",",$outer_id->get_next_value(), $outer_expr_key, $outer_expr_nokey, $outer_multi->get_next_value()),"\n"; } close(OUTER); } sub generate_inner_table() { my ($filename)=@_; open(INNER,">$filename") || die "Can't open file $filename" ; $inner_id=new my_ordinal($inner_rows, 1, $inner_rows); $inner_key=new my_uniform($inner_rows, 0, 1000000000, $inner_rows); for ($i=0;$i<$inner_rows;$i++) { $inner_expr_key=$inner_expr_nokey=$inner_key->get_next_value(); print INNER join(",",$inner_id->get_next_value(), $inner_expr_key, $inner_expr_nokey),"\n"; } close(INNER); } sub load_data() { %create_table_stmt=( ot => "create table ot (id int not null, expr_key int not null, expr_nokey int not null, expr_multi int not null, primary key (id), key (expr_key))", it => "create table it (id int not null, expr_key int not null, expr_nokey int not null, primary key (id), key (expr_key))"); $drop_table_stmt="drop table if exists %s"; $load_data_stmt="load data infile '%s' into table %s FIELDS TERMINATED BY ','"; $dsn="dbi:mysql:$opt_database;host=$opt_host"; $dsn.= ";mysql_socket=$opt_socket" if ($opt_socket); $dbh=DBI->connect($dsn, $opt_user, $opt_password) || die "Can't connect to MySQL server"; foreach $table (ot,it) { do_query($dbh,sprintf($drop_table_stmt,$table)); do_query($dbh,$create_table_stmt{$table}); $filename=$opt_datadir."/".$data_filename{$table}; if ( -f $filename ) { do_query($dbh,sprintf($load_data_stmt,$filename,$table)); } else { print "ERROR: Can't read file: $filename to load data for table: $table\n"; } } } sub do_query { my($dbh, $query)=@_; print "$query\n" if ($opt_debug); $dbh->do($query) or die "\nError executing '$query':\n$DBI::errstr\n"; } sub usage() { print < Directory where script will store result files or look for dataset files to load --generate Generate dataset that contsits of two tables 'outer' and 'inner' and store generated data in plan text files with the same names --ot-rows= Number of rows in outer(ot) table --ot-2-in= Relation between outer and inner tables(ot:in as 1:N) --load-data Load generated dataset with --generate option --database= --host= --user= --password= --socket= EOF } package my_ordinal; sub new { my ($class,$nrows,$min,$max)=@_; my $self={}; bless $self,$class; $self->{'nrows'}= $nrows; $self->{'min'}= $min; $self->{'max'}= $max; $self->{'value'}= 0; $self->{'resvalue'}= ''; $self->{'range'}= $max-$min; $self->{'step'}= 1; #range sparse? if ($self->{'nrows'} < $self->{'range'}) { $self->{'step'}=$self->{'range'}/($self->{'nrows'}-1); } return $self; } sub get_next_value { my ($self)=@_; $self->{'resvalue'}= sprintf("%.0f", $self->{'min'} + $self->{'value'}); $self->{'value'}+= $self->{'step'}; if ($self->{'value'} > $self->{'range'}) { $self->{'value'}=0; } return $self->{'resvalue'}; } package my_uniform; sub new { my ($class,$nrows,$min,$max,$uniques)=@_; my $self={}; bless $self,$class; @root=(2, 7, 26, 59, 242, 568, 1792, 5649, 16807, 30001, 60010, 180001, 360002, 1000001, 2000000); @prime=(11, 101, 1009, 10007, 100003, 1000003, 10000019, 100000007, 2147483647, 10000000019, 100000000003, 1000000000039, 10000000000037, 100000000000031, 1000000000000037); $self->{'nrows'}= $nrows; $self->{'min'}= $min; $self->{'max'}= $max; $self->{'uniques'}= $uniques; $self->{'range'}= $max-$min; $self->{'index'}=(log($self->{'range'})/log(10)); $self->{'G'}= $root[$self->{'index'}]; $self->{'P'}= $prime[$self->{'index'}]; $self->{'seed'}= $self->{'G'}; $self->{'uniques_count'}=0; return $self; } sub get_next_value() { my ($self)=@_; if ($self->{'uniques_count'} >= $self->{'uniques'}) { $self->{'G'}= $root[$self->{'index'}]; $self->{'P'}= $prime[$self->{'index'}]; $self->{'seed'}= $self->{'G'}; $self->{'uniques_count'}=0; } $self->{'resvalue'}=$self->{'min'}+$self->get_uniform(); $self->{'uniques_count'}++; return $self->{'resvalue'}; } sub get_uniform() { my ($self)=@_; $self->{'seed'} = ($self->{'G'} * $self->{'seed'}) % $self->{'P'}; while ( $self->{'seed'} > $self->{'range'}) { $self->{'seed'}= ($self->{'G'}* $self->{'seed'}) % $self->{'P'}; } return ($self->{'seed'} - 1); }