This is my solution for raking the internet with Perl. Just populate the array with the extensions you want. I use the dark art of Google hacks to get the information back.
The script runs as a two-stage loop. Stage one builds a list of URLs where the data can be collected, stored as a hash of arrays with the extension as the base key (and the page URL beneath it). Stage two builds the list of download links for the data and then starts downloading.
#! /usr/perl -w
use Google::Search;
use LWP::Simple;
use HTML::LinkExtor;
use Cwd;
# File extensions to look for. Each extension is used both in the search
# query and as the name of the directory the matching files are saved into.
my @extentions = qw(xls xlsx doc docx jpeg gif png);    # ext to get

# Not referenced anywhere in the visible script; kept in case other code uses it.
my %searchResults = ();

# $downloadlist{extension}{page url} = [ links found on that page ]
my %downloadlist = ();

# Scratch list that the links() callback appends to while a page is parsed.
my @linklist = ();

# file rename properties
# ($dir is not referenced anywhere in the visible script.)
my $dir = '/Users/Robert/workspace/perltest/dir1';
my $cwd = getcwd();
print $cwd;

# HTML::LinkExtor takes a code reference as its callback, so hand it the
# links() sub rather than a hash reference.
my $LinkExtor = HTML::LinkExtor->new( \&links );
# Stage one: for every extension, run a search, fetch each result page and
# record the matching links found on it as
#   $downloadlist{$ext}{$page_uri} = [ links ].
#
# NOTE: $ext is deliberately left as a package variable (no "my"): the links()
# callback reads it to decide which links to keep.
foreach $ext (@extentions) {
    my $search = Google::Search->Web( query => "indexof $ext" );

    while ( my $result = $search->next ) {
        my $page_uri = $result->uri;

        #list the url of search
        print "working with extension: $ext " . $result->rank,
            " the search address ", $page_uri, "\n";

        # get() returns undef when the page cannot be fetched; there is
        # nothing to parse in that case.
        my $html = get($page_uri);
        if ( !defined $html ) {
            print "Couldn't fetch search result: $page_uri\n";
            next;
        }

        # links() appends to the shared @linklist, so empty it first;
        # otherwise every page would also inherit the links collected from
        # all previously visited pages and extensions.
        @linklist = ();

        $LinkExtor = HTML::LinkExtor->new( \&links );

        #use the LinkExtor Sub
        $LinkExtor->parse($html);
        $LinkExtor->eof;    # flush anything the parser is still buffering

        #build Hash table (a copy, because @linklist is reused per page).
        $downloadlist{$ext}{$page_uri} = [@linklist];
    }

    #build a directory for the downloads, labelled by extension
    if ( !-d $ext ) {
        mkdir( $ext, 0777 )
            or warn "Couldn't create directory '$ext': $!\n";
    }
}
print "-----------------------------------------------------\n";
print "finish building table of links now for the down load \n";
print "-----------------------------------------------------\n";

# Stage two: download the files using the hash table
#   {extension}->{page url}->[ links ]
# Every link is fetched into the directory named after its extension.
foreach my $exten ( keys(%downloadlist) ) {

    #exten is the extension we are working with, e.g. doc
    print "working on extention $exten";

    foreach my $url ( keys( %{ $downloadlist{$exten} } ) ) {
        print "\t|-- $url\n";

        #elm is the link exactly as it appeared on the page
        foreach my $elm ( @{ $downloadlist{$exten}{$url} } ) {

            # Resolve the link against the page it was found on instead of
            # blindly concatenating the two strings.
            my $downloadURI = _absolute_link( $url, $elm );

            # Store under the bare file name only, so a link containing
            # "/" or "../" can never write outside the extension directory.
            my $file_name = _local_name($elm);
            if ( $file_name eq '' ) {
                print "Couldn't work out a file name for: $elm\n";
                next;
            }
            my $local_path = "$exten/$file_name";

            #check to see if we have the file before downloading
            if ( -e $local_path ) {
                print "got the file: $elm already\n";
            }
            else {
                my $status = getstore( $downloadURI, $local_path );
                if ( is_success($status) ) {
                    print "got a file from \t\t|-- $downloadURI\n";
                }
                else {
                    #http status code 404?
                    print "Couldn't retrieve page: $status\n $downloadURI\n";
                }
            }
        }
    }
}
print "Finish!!!!!!!!!!!";

# Turn $link, as found on the page $page_url, into an absolute URL.
# Handles absolute links, scheme-relative ("//host/x"), root-relative ("/x")
# and plain relative links. "." and ".." segments are not normalised.
sub _absolute_link {
    my ( $page_url, $link ) = @_;

    # Already absolute: the link carries its own scheme (http:, ftp:, ...).
    return $link if $link =~ m{\A[A-Za-z][A-Za-z0-9+.\-]*:};

    my ( $scheme, $authority, $path ) =
        $page_url =~ m{\A([A-Za-z][A-Za-z0-9+.\-]*:)(//[^/?#]*)([^?#]*)};

    # Page URL has an unexpected shape: fall back to plain concatenation.
    return $page_url . $link if !defined $scheme;

    return $scheme . $link              if $link =~ m{\A//};
    return $scheme . $authority . $link if $link =~ m{\A/};

    # Relative link: replace whatever follows the last "/" of the page path.
    $path =~ s{[^/]*\z}{};
    $path = '/' if $path eq '';
    return $scheme . $authority . $path . $link;
}

# Return the last path segment of $link for use as a local file name, or the
# empty string when no safe name can be derived.
sub _local_name {
    my ($link) = @_;

    my $name = $link;
    $name =~ s{.*[/\\]}{}s;    # drop everything up to the last (back)slash
    return '' if $name eq '.' || $name eq '..';
    return $name;
}
# Callback for HTML::LinkExtor. It is called with the tag name followed by
# attribute => value pairs for that tag. Every link on an <a> tag whose value
# ends in ".$ext" is printed and appended to the file-level @linklist.
#
# $ext is the package variable set by the extension loop of the main script.
# Returns nothing.
sub links {
    my ( $tag, %attr ) = @_;

    return if $tag ne "a";

    foreach my $link ( values %attr ) {

        # Escape the dot and quote the extension so only a literal ".ext"
        # suffix matches (an unescaped "." would also accept e.g. "mydoc").
        if ( $link =~ m/\.\Q$ext\E\z/ ) {
            print "\t\t---->Link Found at ", $link, "\n ";
            push( @linklist, $link );
        }
    }
    return;
}