In my last blog post I included the code for the web crawler that powers my homepage slideshow. When I wrote it I was worried about things like dead websites, bad links, and malicious code. I threw out most of that error-correcting code and made the crawler a lot simpler; it works better, too.
#!/usr/bin/perl
use strict;
use warnings;
use Time::HiRes ('sleep');
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use Net::FTP;
use Try::Tiny;
my(@links,%crawled,$cur_link,$var,$link_var,$temp,$pic,$ua,$index);
$ua = LWP::UserAgent->new;
$ua->timeout(120);
my @UrlArr;
$UrlArr[0] = "http://www.ultimatecarpage.com";
$UrlArr[1] = "http://fuckyeahferrari.tumblr.com";
$UrlArr[2] = "http://a2zracer.com";
$UrlArr[3] = "DavePics/AllPics.html";
$UrlArr[4] = "DavePics/SSJ/Pictures.html";
$UrlArr[5] = "DavePics/AlbertoAscari/AlbertoAscari.html";
$UrlArr[6] = "DavePics/ExcaliburSS/ExcaliburSS.html";
$UrlArr[7] = "DavePics/StutzBearcat/StutzBearcat.html";
$UrlArr[8] = "DavePics/Ferrari330P4/Ferrari330P4.html";
for (my $indexCntr = 0; $indexCntr < scalar(@UrlArr); $indexCntr++)
{
push(@links, $UrlArr[$indexCntr]);
print "$UrlArr[$indexCntr]/n";
}
while (@links)
{
$cur_link = shift(@links);
if($cur_link=~/^http/)
{
# in the next few lines, we retrieve the page content
chomp($cur_link);
$cur_link =~ s/\r$//;
$crawled{$cur_link} = 1 if defined $cur_link;
print "Just got crawled value\n";
my $request = HTTP::Request->new('GET', $cur_link);
#print "Just made request to web page: $!\n";
my $response = $ua->request($request);
if ($response->is_success)
{
print "Just got a response from the web page\n";
}
else
{
print "Request failed: ".$response->status_line."\n";
next;
}
#print "Get the page contents\n";
$var = $response->content();
$link_var = $var;
#print "parse the image tags out of the content\n";
my @p_pics = $var =~ /<img src="[^>]+>/g;
#print @p_pics;
#if there are no images on this page, skip it.
my $arraySize = @p_pics;
#print $arraySize;
my $source = "";
foreach $temp(@p_pics)
{
my $local_temp = substr $temp, 10;
my $char_pos = index($local_temp, '"');
$temp = substr $local_temp, 0, $char_pos;
if(index($temp, "http") == -1)
{
my $first = substr($temp, 0, 1);
if ($first eq '/')
{
$temp=$cur_link.$temp;
}
elsif ($first eq '.')
{
$temp = substr($temp, 3);
my $result = rindex($temp, '/');
$temp = substr($temp, 0, $result);
$temp = $cur_link.$temp;
}
else
{
$temp=$cur_link.'/'.$temp;
}
}
next unless $temp =~ /\bhttps?:[^)'"\s]+\.(?:jpg|JPG|jpeg|JPEG|gif|GIF|png|PNG)/;
# Only interested in files that are > 64K in size
my($type,$size) = head($temp);
next unless defined $size && $size > 65536;
#print $size;
#print "print temp to a file so a web page can use it as the src for an img tag.\n";
open(my $fh, '>', 'data.txt') or die "Cannot open data.txt: $!";
print $fh $temp;
close($fh);
print "Just wrote ".$temp." to data.txt\n";
sleep(0.25);
my $file = 'data.txt';
my $host = 'ip address of host server';
my $user = 'username';
my $pass = 'password';
my $dir = 'directory of community-info.org on host server';
my $ftp = Net::FTP->new($host, Debug => 0);
try
{
$ftp->login($user, $pass);
$ftp->cwd($dir);
$ftp->put($file);
}
catch
{
print "failed to upload data.txt\n";
}
finally
{
$ftp->quit if $ftp;
};
print "Just uploaded data.txt\n";
}
print "\nCurrently Scanning -- ".$cur_link;
#In the next line we extract all links in the current page
my @p_links= $var=~/<a href=\"(.*?)\">/g;
foreach $temp(@p_links)
{
if((!($temp=~/^http/))&&($temp=~/^\//))
{
#This part of the code lets us correct internal addresses
$temp=$cur_link.$temp;
}
#In the next line we add the links
chomp($temp);
$temp =~ s/\r$//;
if (!$crawled{$temp})
{
push(@links,$temp);
}
}
#links are shifted off the front of @links at the top of the loop, so we don't run out of mem
}
}
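For completeness, here is a minimal sketch of the other half of the setup: a page reading data.txt and using it as the src of an img tag, which is what the crawler's comment about data.txt is getting at. This is not the code my site actually runs, just a hypothetical Perl CGI illustration; the file name data.txt matches what the crawler uploads, but the refresh interval and the page markup are made up.
#!/usr/bin/perl
# Hypothetical sketch only: show the picture whose URL the crawler last wrote to data.txt.
use strict;
use warnings;
# data.txt holds a single image URL, written and uploaded by the crawler above
open(my $fh, '<', 'data.txt') or die "Cannot read data.txt: $!";
my $pic_url = <$fh>;
close($fh);
chomp($pic_url) if defined $pic_url;
# emit a tiny page with the image; the meta refresh re-reads data.txt every few seconds
print "Content-type: text/html\n\n";
print "<html><head><meta http-equiv=\"refresh\" content=\"10\"></head>\n";
print "<body><img src=\"$pic_url\" alt=\"slideshow picture\"></body></html>\n";
A plain HTML page with a bit of JavaScript polling data.txt would work just as well; the only contract between the two sides is the single image URL in data.txt.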