In my last blog post I included the code I wrote for the web crawler that powers my homepage slideshow. When I wrote it I was worried about things like dead websites, bad links, and malicious code. I have since thrown out most of that error-correcting code and made the crawler a lot simpler; it works better, too.
#!/usr/bin/perl
use strict;
use warnings;
use Time::HiRes ('sleep');
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use Net::FTP;
use Try::Tiny;

my (@links, %crawled);

my $ua = LWP::UserAgent->new;
$ua->timeout(120);

# Seed pages to start the crawl from.
my @UrlArr = (
    "http://www.ultimatecarpage.com",
    "http://fuckyeahferrari.tumblr.com",
    "http://a2zracer.com",
    "DavePics/AllPics.html",
    "DavePics/SSJ/Pictures.html",
    "DavePics/AlbertoAscari/AlbertoAscari.html",
    "DavePics/ExcaliburSS/ExcaliburSS.html",
    "DavePics/StutzBearcat/StutzBearcat.html",
    "DavePics/Ferrari330P4/Ferrari330P4.html",
);

foreach my $url (@UrlArr) {
    push(@links, $url);
    print "$url\n";
}

# Treat @links as a queue: take the next page off the front (so the list
# doesn't eat all the memory), scan it, and push any new links onto the back.
while (@links) {
    my $cur_link = shift(@links);
    next unless $cur_link =~ /^http/;

    chomp($cur_link);
    $cur_link =~ s/\r$//;
    $crawled{$cur_link} = 1;

    # Retrieve the page content.
    my $request  = HTTP::Request->new(GET => $cur_link);
    my $response = $ua->request($request);
    unless ($response->is_success) {
        print "Request for $cur_link failed: " . $response->status_line . "\n";
        next;
    }
    print "Just got a response from the web page\n";
    my $var = $response->content();

    # Parse the image tags out of the content. If there are no images on
    # this page, @p_pics is empty and the loop below is simply skipped.
    my @p_pics = $var =~ /<img src="[^>]+>/g;

    foreach my $temp (@p_pics) {
        # Strip the leading '<img src="' (10 characters) and keep everything
        # up to the closing quote -- that is the image URL.
        my $local_temp = substr($temp, 10);
        my $char_pos   = index($local_temp, '"');
        $temp = substr($local_temp, 0, $char_pos);

        # Turn relative addresses into absolute ones.
        if (index($temp, "http") == -1) {
            my $first = substr($temp, 0, 1);
            if ($first eq '/') {
                $temp = $cur_link . $temp;
            }
            elsif ($first eq '.') {
                # Rough handling of "../picture.jpg" sources: drop the "../"
                # and resolve against the directory of the current page.
                $temp = substr($temp, 3);
                my $base = $cur_link;
                $base =~ s{/[^/]*$}{} if $base =~ m{^https?://[^/]+/};
                $temp = $base . '/' . $temp;
            }
            else {
                $temp = $cur_link . '/' . $temp;
            }
        }

        # Only interested in image files...
        next unless $temp =~ /\bhttps?:[^)'"\s]+\.(?:jpg|jpeg|gif|png)/i;

        # ...and only in files that are > 64K in size: skip anything we know
        # is smaller (thumbnails, icons, spacers).
        my ($type, $size) = head($temp);
        next if defined $size && $size < 64 * 1024;

        # Print the URL to a file so a web page can use it as the src for an img tag.
        open(my $out, '>', 'data.txt') or die "Cannot write data.txt: $!";
        print $out $temp;
        close($out);
        print "Just wrote $temp to data.txt\n";
        sleep(0.25);

        # Upload data.txt to the web server.
        my $file = 'data.txt';
        my $host = 'ip address of host server';
        my $user = 'username';
        my $pass = 'password';
        my $dir  = 'directory of community-info.org on host server';

        my $ftp = Net::FTP->new($host, Debug => 0);
        if ($ftp) {
            try {
                $ftp->login($user, $pass) or die $ftp->message;
                $ftp->cwd($dir)           or die $ftp->message;
                $ftp->put($file)          or die $ftp->message;
                print "Just uploaded data.txt\n";
            } catch {
                print "failed to upload data.txt: $_";
            } finally {
                $ftp->quit;
            };
        } else {
            print "failed to connect to the FTP server: $@\n";
        }
    }

    print "\nCurrently Scanning -- $cur_link\n";

    # Extract all the links in the current page and queue the ones
    # we haven't crawled yet.
    my @p_links = $var =~ /<a href="(.*?)">/g;
    foreach my $temp (@p_links) {
        # Correct internal (site-relative) addresses.
        if ($temp !~ /^http/ && $temp =~ /^\//) {
            $temp = $cur_link . $temp;
        }
        chomp($temp);
        $temp =~ s/\r$//;
        push(@links, $temp) unless $crawled{$temp};
    }
}
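As an aside, the relative-address handling is the fiddliest part of the script. The URI module (which LWP already depends on) can do that resolution for you, and LWP::Simple's head() is what supplies the size for the 64K filter. Here is a minimal sketch of both; the page URL and image src in it are made up purely for illustration and aren't from the crawler above.

#!/usr/bin/perl
use strict;
use warnings;
use URI;
use LWP::Simple ('head');

# Hypothetical page URL and img src, just to show the resolution step.
my $page = 'http://www.example.com/gallery/page1.html';
my $src  = '../images/ferrari-330-p4.jpg';

# URI->new_abs() turns a relative src into an absolute URL
# against the page it came from.
my $absolute = URI->new_abs($src, $page);
print "Resolved image URL: $absolute\n";   # http://www.example.com/images/ferrari-330-p4.jpg

# head() in list context returns (content type, document length,
# modified time, expires, server); the length is what the 64K filter checks.
my ($type, $size) = head($absolute);
if (defined $size && $size > 64 * 1024) {
    print "Large enough for the slideshow: $size bytes\n";
}
else {
    print "Skipping (small or unknown size)\n";
}

URI->new_abs() also copes with "./", "../../", and query strings, which the hand-rolled string slicing doesn't.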