#!/bin/perl # # loadtest will populate an on-disk RDF store with data from BEGIN {unshift@INC,('../../../','../../','../');} use RDF::API::Database; use RDF::RDFWeb::Node; use RDF::RDFWeb::XRDFDataSource; use RDF::RDFWeb::MemDB; use RDF::RDFdb; use strict; use Carp; my %cmd; # commandline args from --arg-name=value-passed my %prov; # exploring provenance tracking my %canon; # used by --loadfilter=fold my %MonoProperty; # uniquely identifying predicates # $MonoProperty{'http://xmlns.com/foaf/0.1/mbox'}=1; # shouldn't hardcode this while ($_ = shift()) { next unless ($_ =~ s/^--//); my($lhs,$rhs)=split(/=/,$_,2); $lhs=lc($lhs); chomp $rhs; $cmd{$lhs}=$rhs if ($rhs); } #foreach (keys %cmd) { print "arg: $_ value $cmd{$_} \n";} my $parser = $cmd{'parser'} || 'http://cara.sourceforge.net/'; my $data = $cmd{'data'} ; my $skip = $cmd{'skip'}; my $select = $cmd{'select'}; my $op = 'view'; if ($cmd{'op'}) { $op = $cmd{'op'}; } else { die ("No operation specified (view/load/md5/etc...)\n"); # print "Defaulting to --op=view to view triples [ **NOT LOADING INTO DB!**]\n"; } if (!$data && !$cmd{'batch'} ) { die ("Must specify data source(s) using --data= or --batch= "); } #################################################################### ## ## Figure out which batch of files to load my %skiptypes; # source to skip my %selecttypes; # source to skip foreach (split(/,/,$skip) ) { $skiptypes{$_}=1;} foreach (split(/,/,$select) ) { $selecttypes{$_}=1;} my %skip; # sources to skip my %select; my %batch; # sources in the batch my %final; if ($cmd{'batch'}) { open (IN,"$cmd{'batch'}") || croak ("Can't load batchlist ",$cmd{'batch'}); while(my $src = ) { chomp $src; next unless $src; next if m/#/; my ($res,$flavours)=split(/\s+/,$src,2); $batch{$res}=1; # add to batch, subtract later ## loop through categories that apply to this src $flavours =~ m/\[([^\]]+)\]/; foreach (split (/,/,$1)) { if (($skip) && $skiptypes{$_}) { $skip{$res}=1; } if (($select) && $selecttypes{$_}) { $select{$res}=1; } if (!$select && !$skip) { $final{$res}=1; } } } } # print "SELECT: ",keys %select ,"\n"; # print "SKIP: ",keys %skip ,"\n"; if ($skip) { foreach (keys %batch) { $final{$_}=1 unless ($skip{$_});} # remove } if ($select) { foreach (keys %batch) { $final{$_}=1 if ($select{$_});} # remove } print "FINAL SELECT: ",keys %final,"\n" if ($select || $skip); my $localdb = $cmd{'localdb'} || './tmp/'; # stored here as default db $final{ $cmd{'data'} } = 1 if ($cmd{'data'}); if ($cmd{'op'} eq 'load') { open (STATE, ">STATE") || die "can't open state file"; print STATE "#loader statelog for $localdb (past history unknown)\n"; # todo: should only be for load-log } #################################################################### #################################################################### #################################################################### my $db; ## set up connection to database before we do anything if ($op eq 'load' || $cmd{'loadfilter'} ) { print "Connecting to rdfdb on disk...\n"; #hmmxxx: (trying to get rid of some of this complexity) $db = RDF::API::Database->new(); # $db = new RDF::RDFdb( undef, $localdb.'default' ); $db->set('module'=>'RDF::RDFdb'); $db->set('rw'=>'1'); $db->set('datastore'=> $localdb.'default'); $db->open(); ## database workaround #$db->assert('http://rdfweb.org/2001/01/swipe-ns#swiper','from-test','to-test'); $db->index(); $db->close(); $db->open(); } ## loop through the data files foreach my $todo (keys %final) { print "Doing [ $op ] on $todo \n"; my $mem = new RDF::RDFWeb::MemDB; # in memory database my $ds = new RDF::RDFWeb::XRDFDataSource( $parser ); $ds->{'baseuri'}=$todo; my $rdf; eval { $ds->get( $todo ); $rdf = data $ds; }; if ($@) { print STDERR "Error loading data with $parser from $todo .\n"; print STDERR $@; } print "Got xrdf: $rdf\n"; print "Got db: $db\n"; print STATE "LOADED: data: $todo datalines: ". scalar @{ $rdf } ."\n"; my %known; foreach my $t ( @{ $rdf } ) { my %s = %{ $t }; my ($p,$s,$o) = ( $s{'p'}, $s{'s'}, $s{'o'} ); print "FIRST PASS: $p ( $s , $o ) \n"; #xxxxxx ###$mem->assert (value $p, value $s, value $o); if ($MonoProperty{$p}){ print "MONO: $p ON: $s \n"; print "$s is a genid\n" if isGenId $s; print "MONO LOOK: ", localcheck($p, $o, $db), "\n"; } } print "\n=====================\nmemory database now holds:\n"; print $mem->report(); print "\n=====================\n2nd pass of processing\n"; foreach my $t ( @{ $rdf } ) { my %s = %{ $t }; my ($p,$s,$o) = ( $s{'p'}, $s{'s'}, $s{'o'} ); print "[ $todo ] data: '$s' -- '$p' --> '$o' \n" if ($op eq 'view'); print md5_statement($p,$s,$o)."\n" if ($op eq 'md5'); # here we normalise incoming triples based on cardinality constraints etc # if ($cmd{'loadfilter'} eq 'fold') { ($p,$s,$o) = loadfilter($p,$s,$o, $db ); print "LOADFILTER db=$db: new data: '$s -- '$p' --> '$o' \n"; } $db->assert($p,$s,$o) if ($op eq 'load'); print "[ $todo ] loaded: '$s' -- '$p' --> '$o' \n" if ($op eq 'load'); } print "End [ $todo ].\n"; } # end loop through loader tasks $db->close if ($op eq 'load'); undef $db; print "End. Closed database $db. \n"; close STATE; ## Complain about missing commandline args; display xample usage. ## sub noparam { my $missing; my $msg; print STDERR "Error: $msg\nUse $missing.\n"; print STDERRR "Usage: $0 --data=.... [ --parser=... ] [ --db=... ]\n"; } sub md5_statement { my $in=shift; my $out; eval { use Digest::MD5 qw(md5_hex); $out = "data:,".md5_hex($in); }; if ($@) { print STDERR "MD5 error. ",$@;} return $out; # todo: handle module dependency more reponsibly... } # sometimes we want to filter triple before assertion # this method gets hooked in when --loadfilter=fold # # how it will work: # first we pass through the data looking for objects that # we may already have a unique ID for. We do this by looking for # triples with a predicate of a uniquely identifying kind. # These (if generated IDs) go into a list of resources we might want # to use an existing URI for. # # Second, we pass through looking for cases where we're learning the real # URI for something we previously made up a generated ID for. # sub loadfilter{ my $predicate=shift; my $subject=shift; my $object=shift; my $dbref=shift; my $db = $dbref; # print "FILTERING: .... DB=$db\n"; # $o = "xxx" . $o . "zzz";# prove this hook works # node folding code from scutter # if the assertion uses a 'monoproperty' as its predicate... if ($MonoProperty{$predicate}){ print "SMUSH: Attemping query: pred=$predicate object=$object \n"; my @t = $db->queryAndReturn($predicate,'',$object); print "SMUSH: DEBUG: result: [ ".join(" ; ",@t)." ]\n\n"; my $p=shift @t; my $s=shift @t; my $o=shift @t; # Not a good API... print "SMUSH: asked pred=$predicate sub=null obj=$object got: p=$p s=$s o=$o \n\n"; if ($s) { print "SMUSH: MonoProperty $predicate where value $object has source we know as: $s\n"; print "SMUSH: LocallyCanonicalising $subject to $s\n\n"; $canon{$subject}=$s; $subject = $s; } else { print "SMUSH: MonoProperty $predicate where value $object has no existing internal ID.\n"; } # this is graceless, ugly, wrong... if ($canon{$subject}) { dbgCanon("subject",$subject, $canon{$subject} ); $subject = $canon{$subject}; } if ($canon{$predicate}) { dbgCanon("predicate",$predicate, $canon{$predicate} ); $predicate = $canon{$predicate}; } if ($canon{$object}) { dbgCanon("object",$object, $canon{$object} ); $object = $canon{$object}; } sub dbgCanon { my ($x,$old,$new)=@_; print "Folding $x : from $old to $new \n"; } $db->assert($predicate,$subject,$object); } else { print "LOADFILTER: Predicate $predicate not uniquely identifying\n"; } ##end of rdfweb scutter code my @s = ($predicate,$subject,$object); # todo: find a common way of passing triples around return @s; } # example: gen_1 - foaf:mbox -> mailto:danbri@w3.org # sub localcheck { my ($predicate, $object) = @_; my $db=shift; my @t = $db->queryAndReturn($predicate,'',$object); print "LOCAL-CHECK: [ ".join(" ; ",@t)." ]\n\n"; my $p=shift @t; my $s=shift @t; my $o=shift @t; # Not a good API... print "LOCAL-CHECK: asked pred=$predicate sub=null obj=$object got: p=$p s=$s o=$o \n\n"; return $s; }