Scroll down to see the rest of this webpage
Play Classical Gas here
In my last blog I included the code I wrote for the web crawler that powers my homepage slideshow. When I wrote it I was worried about things like dead websites, bad links, malicious code, etc. I threw out most of that error-correcting code, and made the crawler a lot simpler; it works better, too.
#!/usr/bin/perl
use strict;
use warnings;
use lib '/usr/local/share/perl/5.30.3/auto/Data/Validate/URI';
use Data::Validate::URI;
use LWP::Simple;
use LWP::UserAgent;
use Net::FTP;
use Time::HiRes ('sleep');
use Try::Tiny;
use URI;
# Script-wide crawler state. The crawl loop further down relies on these
# names, so they are all declared here up front.
my (@links, %crawled, $cur_link, $var, $link_var, $temp, $pic, $ua, $index);

# HTTP client used for every page fetch; the timeout is given in seconds.
$ua = LWP::UserAgent->new;
$ua->timeout(120);

# Validator used to decide whether a discovered link may be queued.
my $uriValidator = Data::Validate::URI->new();

# ASCII BEL, printed as an audible progress signal.
my $bell = chr(7);

# Seed list. Entries that do not start with "http" are local page paths;
# the crawl loop only fetches entries that begin with "http".
my @UrlArr = (
    "http://kingsizejuggs.com",
    "http://allnude.sexy",
    "http://sexygirlspics.com",
    "http://bigtitsbreasts.com/",
    "http://maturenudewomen.net",
    "DavePics/OpenPussy/OpenPussy.html",
    "DavePics/AllPics.html",
    "DavePics/SSJ/Pictures.html",
    "DavePics/AlbertoAscari/AlbertoAscari.html",
    "DavePics/ExcaliburSS/ExcaliburSS.html",
    "DavePics/StutzBearcat/StutzBearcat.html",
    "DavePics/Ferrari330P4/Ferrari330P4.html",
    "http://www.ultimatecarpage.com",
    "http://wallpaperswide.com",
    "http://maturehomemadeporn.com",
    "http://aepics.com",
    "http://tophugeboobs.com",
    "http://nudemodels.sexy",
    "http://bigtitswebcams.net",
);

# Queue every seed in order and echo it to the console.
foreach my $seed (@UrlArr)
{
    push(@links, $seed);
    print $seed."\n";
}
# ---------------------------------------------------------------------------
# Crawl loop
#
# Takes URLs off the front of @links one at a time. For every page that can
# be fetched and that contains <img src="..."> tags, each usable image URL is
# written to data.txt and uploaded to the web host over FTP; afterwards the
# page's <a href="..."> targets are appended to @links.
#
# Changes compared with the previous version of this loop:
#   * @links is consumed with shift in a while loop. The old code iterated
#     it with foreach while pushing to and shifting from it, which skipped
#     entries and removed an element other than the one just processed.
#   * Relative image and link URLs are resolved against the page's base URL
#     with URI->new_abs instead of string concatenation (the "./" and "../"
#     branch used to mangle the URL).
#   * The image-URL pattern is actually applied as a filter (it used to be
#     evaluated and its result thrown away) and accepts https as well.
#   * No "next" inside a Try::Tiny block; loop control happens outside.
#   * data.txt is opened with a checked three-argument open.
#   * Every FTP step is checked, so "Just uploaded" is only printed after a
#     successful put, and no FTP connection is made for skipped images.
#   * A URL is recorded in %crawled when it is queued, so the same URL is
#     not queued again and "just added" is only printed for real additions.
# ---------------------------------------------------------------------------

# Upload $file to $dir on $host. Returns 1 on success and dies with a short
# reason (terminated by a newline) on any failure. The connection is closed
# before returning or dying once it has been established.
sub _upload_data_file
{
    my ($host, $user, $pass, $dir, $file) = @_;

    my $ftp = Net::FTP->new($host, Debug => 0)
        or die "cannot connect to FTP host: $@\n";

    my $problem;
    if (!$ftp->login($user, $pass))
    {
        $problem = "login failed: ".$ftp->message;
    }
    elsif (!$ftp->cwd($dir))
    {
        $problem = "cwd failed: ".$ftp->message;
    }
    elsif (!$ftp->put($file))
    {
        $problem = "put failed: ".$ftp->message;
    }
    $ftp->quit;

    if (defined $problem)
    {
        $problem =~ s/\s+\z//;
        die $problem."\n";
    }
    return 1;
}

# Upload settings (placeholders; fill in before running).
my $data_file = 'data.txt';
my $ftp_host  = 'ip address of host server';
my $ftp_user  = 'username';
my $ftp_pass  = 'password';
my $ftp_dir   = 'directory for community-info.org on host server';

# Only URLs that look like a jpg/jpeg/gif/png image are published.
my $image_url_re = qr/\bhttps?:[^)'"\s]+\.(?:jpe?g|gif|png)/i;

# Everything already waiting in the queue counts as "seen".
$crawled{$_} = 1 for @links;

while (defined($cur_link = shift @links))
{
    chomp($cur_link);
    $cur_link =~ s/\r$//;

    # Local seed pages (no http prefix) cannot be fetched by the crawler.
    next unless $cur_link =~ /^http/;

    $crawled{$cur_link} = 1;
    print "Just got crawled value\n";

    # Retrieve the page content. get() always returns a response object,
    # so success is decided by is_success alone.
    my $response = $ua->get($cur_link);
    print "Just got a response from the web page\n";
    if (!$response->is_success)
    {
        print "HTTP GET error code: ".$response->code."\n";
        print "HTTP GET error message: ".$response->message."\n";
        next;
    }

    # Prefer the decoded body; fall back to the raw bytes if decoding fails.
    $var = $response->decoded_content;
    $var = $response->content unless defined $var;
    print "Received reply ".$var."\n";

    # Parse the image tags out of the content. If there are no images on
    # this page, skip it entirely (its links are not followed either).
    my @p_pics = $var =~ /<img src=\"[^>]+>/g;
    next unless @p_pics;
    print "This page has ".scalar(@p_pics)." images\n";

    # Base URL used to resolve relative references found on this page.
    my $base = $response->base;

    foreach my $img_tag (@p_pics)
    {
        next if index($img_tag, "powweb") > -1;

        # Pull the src value out of the tag; tags without a closing quote
        # are ignored.
        my ($src) = $img_tag =~ /<img src="([^"]*)"/;
        next unless defined $src && length $src;

        my $pic_url = URI->new_abs($src, $base)->as_string;
        next unless $pic_url =~ $image_url_re;

        # Lots of sites use transparent.gif files between images.
        if (index($pic_url, "trans") > -1)
        {
            print "Didn't upload data.txt because it is a transparency\n";
            next;
        }
        if (index($pic_url, "powweb") > -1)
        {
            print "Didn't upload ".$pic_url."\n";
            next;
        }

        # data.txt holds exactly one URL: the picture the slideshow page
        # should display next.
        my $fh;
        if (!open($fh, '>', $data_file))
        {
            print "Could not open ".$data_file." for writing: $!\n";
            next;
        }
        print {$fh} $pic_url;
        if (!close($fh))
        {
            print "Could not finish writing ".$data_file.": $!\n";
            next;
        }
        print "Just wrote ".$pic_url." to data.txt\n";
        print $bell;

        try
        {
            _upload_data_file($ftp_host, $ftp_user, $ftp_pass, $ftp_dir, $data_file);
            print "Just uploaded data.txt\n";
            # Pause between uploads so each picture stays up for a moment.
            sleep(1.5);
        }
        catch
        {
            print "failed to upload data.txt\n";
            print "Reason: ".$_;
        };
    }

    print "\nCurrently Scanning -- ".$cur_link;

    # Extract all links and queue the ones that are new, valid web URLs.
    my @p_links = $var =~ /<a href=\"(.*?)\">/g;
    foreach my $href (@p_links)
    {
        chomp($href);
        $href =~ s/\r$//;

        # Resolve internal addresses against the page and drop any
        # "#fragment" so the same page is not queued under several names.
        my $uri = URI->new_abs($href, $base);
        $uri->fragment(undef);
        my $target = $uri->as_string;

        print "We are going to add ".$target." to the links array\n";
        if (!$uriValidator->is_web_uri($target))
        {
            print "Didn't add ".$target." because it is not a valid url.\n";
            next;
        }
        next if $crawled{$target};

        $crawled{$target} = 1;
        push(@links, $target);
        print "just added ".$target." to the end of the links list\n";
    }

    # The page was already taken off the queue by the shift above, so the
    # list never holds finished entries.
    print "just removed ".$cur_link." from the front of the links list\n";
}
I recently updated gas.html, too. Got rid of the problem with non-displayable images; I now display an image from my home directory. Made a few more changes to the code that also speed up the slideshow (by simplifying, or optimizing, the code). Here is my new code for gas.html:
<!DOCTYPE html>
<html>
<body>
<div>
<img id="imageObject" src="images/Sophia-Loren-Mercedes-Benz-300SL-Gullwing-1955.jpg">
</div>
<script>
/**
 * Issue an asynchronous GET for `url` and hand the finished request object
 * to `myFunction`.
 *
 * A random query string is appended to the URL and a "Cache-Control:
 * no-cache" request header is sent with it. The callback is invoked only
 * when the request has completed (readyState 4) with HTTP status 200; for
 * any other outcome nothing is called.
 *
 * @param {string} url - Resource to fetch.
 * @param {function(XMLHttpRequest)} myFunction - Receives the request object.
 */
function loadDoc(url, myFunction)
{
    var request = new XMLHttpRequest();

    request.onreadystatechange = function () {
        // Ignore every state change except "done".
        if (this.readyState != 4) {
            return;
        }
        // Ignore completed requests that were not answered with 200.
        if (this.status != 200) {
            return;
        }
        myFunction(this);
    };

    // Random suffix so that each call requests a different URL.
    var suffix = Math.random() * Math.random();
    var target = url + '?' + suffix;

    request.open("GET", target, true);
    request.setRequestHeader('Cache-Control', 'no-cache');
    request.send();
}
/**
 * Handle one poll of data.txt: choose the picture URL to show, preload it,
 * size the on-page <img id="imageObject"> once the picture has loaded, and
 * schedule the next poll.
 *
 * Changes compared with the previous version:
 *   - `text` is a local variable. It used to be an implicit global, so a
 *     later poll could overwrite it before an earlier picture's onload
 *     fired, and the wrong URL was then assigned to the page image.
 *   - Sizes are applied through inline style with an explicit "px" unit in
 *     every branch. Unitless numbers assigned to style.width/height are
 *     ignored in standards mode, and mixing style with width/height
 *     attributes would let a stale inline style override later attributes.
 *   - The next poll is scheduled with a function instead of a string that
 *     has to be evaluated.
 *
 * @param {XMLHttpRequest} xhttp - Completed request; responseText is
 *     expected to hold a single picture URL.
 */
function myFunction(xhttp)
{
    // Apply a pixel size to the page image.
    function setSize(element, widthPx, heightPx)
    {
        element.style.width = widthPx + "px";
        element.style.height = heightPx + "px";
    }

    var text = xhttp.responseText;

    // Anything that is not an http URL, or that is a trans.gif spacer, is
    // replaced by a local picture.
    if ((text.toLowerCase().indexOf("http") == -1) || (text.toLowerCase().indexOf("trans.gif") > -1))
    {
        text = "DavePics/pic05.jpg";
    }
    // Any other URL containing "trans" gets a different local picture.
    if (text.toLowerCase().indexOf('trans') > -1)
    {
        text = "DavePics/MB300SLR.jpg";
    }
    // Very short strings are replaced by a fixed fallback URL.
    if (text.length < 12)
    {
        text = "http://lustfulbabespics.com/g/b4f563/th_03.jpg";
    }

    // Preload off-screen so the natural size is known before display.
    var img = new Image();
    img.onload = function()
    {
        var target = document.getElementById("imageObject");
        var height = img.height;
        var width = img.width;
        var imgDivWidth = 1.1 * document.body.offsetWidth;

        if (width < 100)
        {
            // Tiny pictures are shown in a fixed 490x540 box.
            // NOTE(review): the old code first set src to
            // "DavePics/BettyPageBikini.jpg" and immediately overwrote it
            // with `text`; only the second assignment had any lasting
            // effect, so that is what is kept here. Confirm whether the
            // fallback picture was the intended result for tiny images.
            setSize(target, 490, 540);
        }
        else if (width > 490)
        {
            // Wide pictures are scaled to 1.1x the body width, keeping the
            // aspect ratio.
            setSize(target, imgDivWidth, height * imgDivWidth / width);
        }
        else
        {
            // Everything else is shown at its natural size.
            setSize(target, width, height);
        }
        target.src = text;
    };
    img.src = text;

    // Poll data.txt again in 10 ms.
    setTimeout(function () { loadDoc('data.txt', myFunction); }, 10);
}
// Start the polling loop: fetch data.txt and pass the response to myFunction.
loadDoc('data.txt', myFunction);
</script>
</body>
</html>