 In my last blog post I included the code I wrote for the web crawler that powers my homepage slideshow. When I wrote it I was worried about things like dead websites, bad links, malicious code, etc. I threw out most of that error-correcting code and made the crawler a lot simpler; it works better, too.

use strict;	
use warnings;
use Time::HiRes ('sleep');
use LWP::Simple;
use LWP::UserAgent;
use Net::FTP;
use Try::Tiny;

# ---------------------------------------------------------------------------
# Slideshow crawler.
#
# Starting from the seed URLs in @UrlArr, fetch each page, pull the <img>
# sources out of it and, for every image larger than $MIN_IMAGE_BYTES, write
# its URL to data.txt and upload that file by FTP.  Links found on each page
# are queued and crawled in turn.
#
# NOTE(review): the listing this was recovered from had lost its braces and
# several statements (the body of the root-relative branch, the size test,
# the FTP cwd/put calls, the push onto the link queue).  The control flow
# below is reconstructed from the surviving statements and comments --
# confirm it against the original script.
# ---------------------------------------------------------------------------

# Seed pages.  NOTE(review): the first three entries are empty and the rest
# are relative paths -- the absolute URLs appear to have been stripped from
# the listing.  Only absolute http(s) URLs can be fetched, so fill these in.
my @UrlArr = (
    "",
    "",
    "",
    "DavePics/AllPics.html",
    "DavePics/SSJ/Pictures.html",
    "DavePics/AlbertoAscari/AlbertoAscari.html",
    "DavePics/ExcaliburSS/ExcaliburSS.html",
    "DavePics/StutzBearcat/StutzBearcat.html",
    "DavePics/Ferrari330P4/Ferrari330P4.html",
);

# Only interested in image files that are > 64K in size.
my $MIN_IMAGE_BYTES = 64 * 1024;

# FTP destination for data.txt (placeholders -- replace before running).
# NOTE(review): credentials are hard-coded; consider reading them from the
# environment or a file outside version control.
my %FTP_CONFIG = (
    host => 'ip address of host server',
    user => 'username',
    pass => 'password',
    dir  => 'directory of on host server',
);

# Return the value of attribute $attr from every <$tag ...> element in $html,
# in document order.  Accepts single- or double-quoted values and any
# attribute order; empty values are dropped.
sub _extract_attribute {
    my ($html, $tag, $attr) = @_;
    return unless defined $html;

    my @values;
    while ($html =~ m{<\Q$tag\E\b[^>]*?\s\Q$attr\E\s*=\s*(["'])(.*?)\1}gis) {
        push @values, $2 if length $2;
    }
    return @values;
}

# Return the src of every <img> tag in $html.
sub extract_image_sources {
    my ($html) = @_;
    return _extract_attribute($html, 'img', 'src');
}

# Return the href of every <a> tag in $html.
sub extract_links {
    my ($html) = @_;
    return _extract_attribute($html, 'a', 'href');
}

# Turn $ref, as found on the page $base, into an absolute http(s) URL.
# Handles absolute, protocol-relative (//host/x), root-relative (/x) and
# document-relative (x, ./x, ../x) references.  Returns nothing when $ref
# cannot be crawled (empty, fragment-only, or a non-http scheme such as
# mailto:) or when $base is not itself an absolute http(s) URL.
sub resolve_url {
    my ($base, $ref) = @_;
    return unless defined $base && defined $ref && length $ref;

    return $ref if $ref =~ m{\A https?:// }xi;
    return      if $ref =~ m{\A (?: [a-z][a-z0-9+.\-]* : | \# ) }xi;

    my ($origin, $scheme) = $base =~ m{\A ( (https?) :// [^/?\#]+ ) }xi
        or return;

    return "$scheme:$ref" if $ref =~ m{\A // }x;
    return $origin . $ref if $ref =~ m{\A /  }x;

    # Separate the path part of the reference from any ?query / #fragment so
    # that '.' and '..' are only interpreted inside the path.
    my ($ref_path, $ref_suffix) = $ref =~ m{\A ([^?\#]*) (.*) \z}xs;

    (my $base_no_query = $base) =~ s{[?\#].*\z}{}s;
    return $base_no_query . $ref_suffix if $ref_path eq '';

    # Directory of the base document: drop everything after the last '/'.
    my $base_path = substr($base_no_query, length($origin));
    $base_path =~ s{[^/]*\z}{};
    my @segments = grep { length } split m{/}, $base_path;

    for my $part (split m{/}, $ref_path, -1) {
        if    ($part eq '..') { pop @segments }
        elsif ($part ne '.')  { push @segments, $part }
    }
    return $origin . '/' . join('/', @segments) . $ref_suffix;
}

# True when $url is an http(s) URL that names a jpg/jpeg/gif/png file.
sub is_image_url {
    my ($url) = @_;
    return 0 unless defined $url;
    return $url =~ m{\A https?: [^)'"\s]+ \. (?:jpe?g|gif|png) }xi ? 1 : 0;
}

# Write $url to data.txt so a web page can use it as the src for an img tag,
# then upload data.txt to the configured FTP server.  Returns 1 on success,
# 0 on any failure (failures are reported, never fatal, so the crawl goes on).
sub publish_image_url {
    my ($url) = @_;
    my $file = 'data.txt';

    my $out;
    if (!open $out, '>', $file) {
        warn "Cannot open $file for writing: $!\n";
        return 0;
    }
    print {$out} $url;
    if (!close $out) {
        warn "Cannot finish writing $file: $!\n";
        return 0;
    }
    print "Just wrote ".$url." to data.txt\n";

    my $ftp = Net::FTP->new($FTP_CONFIG{host}, Debug => 0);
    if (!$ftp) {
        # Net::FTP->new leaves the reason for a failed connection in $@.
        warn "Cannot connect to FTP host: $@\n";
        print "failed to upload data.txt\n";
        return 0;
    }

    my $uploaded = $ftp->login($FTP_CONFIG{user}, $FTP_CONFIG{pass})
                && $ftp->cwd($FTP_CONFIG{dir})
                && $ftp->put($file);
    # Capture the server's reply before quit() overwrites it.
    my $reply = $ftp->message;
    $ftp->quit;

    if (!$uploaded) {
        warn "FTP upload failed: $reply\n";
        print "failed to upload data.txt\n";
        return 0;
    }
    print "Just uploaded data.txt\n";
    return 1;
}

# Crawl breadth-first from the seeds in @UrlArr, publishing every large
# image found along the way.
sub run_crawler {
    my @links;      # queue of pages still to fetch
    my %crawled;    # every URL ever queued, so nothing is fetched twice

    for my $seed (@UrlArr) {
        print "$seed\n";
        (my $url = $seed) =~ s/\r$//;
        next unless length $url;
        if ($url !~ m{\A https?:// }xi) {
            warn "Skipping seed that is not an absolute http(s) URL: $url\n";
            next;
        }
        push @links, $url unless $crawled{$url}++;
    }
    return unless @links;

    my $ua = LWP::UserAgent->new;

    # Taking each page off the front of the queue (rather than iterating over
    # an array that is being appended to) keeps memory use down and avoids
    # modifying a list while looping over it.
    while (defined(my $cur_link = shift @links)) {
        print "Just got crawled value\n";

        # Retrieve the page content.
        my $request  = HTTP::Request->new(GET => $cur_link);
        my $response = $ua->request($request);
        if (!$response->is_success) {
            print "Could not fetch $cur_link: ", $response->status_line, "\n";
            next;
        }
        print "Just got a response from the web page\n";
        my $var = $response->content();

        # Parse the image tags out of the content and publish the big ones.
        for my $src (extract_image_sources($var)) {
            my $image = resolve_url($cur_link, $src);
            next unless is_image_url($image);

            # In list context head() returns (content type, length, ...);
            # it returns an empty list when the request fails.
            my ($type, $size) = head($image);
            next unless defined $size && $size > $MIN_IMAGE_BYTES;

            publish_image_url($image);
        }

        print "\nCurrently Scanning -- ".$cur_link;

        # Extract all links in the current page, correct internal
        # (relative) addresses, and queue the ones not seen before.
        for my $href (extract_links($var)) {
            $href =~ s/\r$//;
            my $next = resolve_url($cur_link, $href);
            next unless defined $next;
            $next =~ s/\#.*\z//s;    # page#a and page#b are the same page
            push @links, $next unless $crawled{$next}++;
        }
    }
    return;
}

# Run only when executed as a script, so the helpers can be loaded and
# exercised on their own.
run_crawler() unless caller;

