I have been hacking more Perl and this is my latest creation. My cup of
java went empty today, so it looks like this will be all for today. :)
The script goes down a directory tree of html pages,
sucks up the content of each page, tests all the http links in the page,
and generates a web page report showing the BAD LINKS. No one has bad
links on their site, right? :) Well, if you do, I hope you find this tool
useful. I also put this script along with my other site tools on my
Coinduperl page:
http://www.brie.com/coinduperl/
Here's some sample output.
http://brie.com/testlinks.html
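If you want to try it yourself, save the source below as something like
testlinks.pl (pick whatever name you like), edit the $base_url and
$base_dir variables at the top to match your site, and run it:

  perl testlinks.pl

The report lands in testlinks.html at the top of your document root, so
afterwards you can just browse to http://yoursite/testlinks.html.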
The link checker requires the LWP modules. You can find LWP on CPAN;
check http://www.cpan.org/ for an archive near you.
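If you have the CPAN module set up, installing the whole LWP bundle
(which usually pulls in HTML::Parser and URI too) is a one liner:

  perl -MCPAN -e 'install Bundle::LWP'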
Oh, the source of course. It's on Coinduperl too. Feel free to hack it
and use it however you like. (One idea for a hack is sketched after the
listing.)
#!/usr/bin/perl
use HTTP::Request;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
use File::Find;
use strict;
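# Configuration: point the two "base" variables below at your own site.
# $base_url is the public URL of the site; $base_dir is the local
# document root that File::Find will walk looking for .html files.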
my $reposit = "/tmp/test" . $$ . ".txt"; # This is only a temporary file.
my $base_url = "http://www.brie.com";
my $base_dir = "/home/www/htdocs";
if (-f "$base_dir/testlinks.html") {
    die "$base_dir/testlinks.html already exists.\nRemove or move it before running this.\n";
}
open (OUTFILE, ">$base_dir/testlinks.html")
    or die "Could not write $base_dir/testlinks.html: $!\n";
open (REPOSIT, ">$reposit") or die "Could not write $reposit: $!\n";
finddepth(\&wanted, $base_dir);   # collect URLs of all .html pages into REPOSIT
close (REPOSIT);
open (INREPOSIT, "<$reposit") or die "Could not read $reposit: $!\n"; # Pages to evaluate
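# The report header and footer are emitted with Perl's format/write
# mechanism: select() the filehandle, set $~ to the format name, then
# write().  (perldoc perlform has the details.)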
my $oldhandle = select(OUTFILE);
$~ = "BEGIN_HTML";
select ($oldhandle);
write OUTFILE;
my $ua = LWP::UserAgent->new;   # one user agent, reused for every request
my $i = 0;
while (my $test_page = <INREPOSIT>) {
    chomp $test_page;
    my $request = HTTP::Request->new(GET => $test_page);
    my $response = $ua->request($request);
    if ($i % 5 == 0) { print OUTFILE "\n" };
    $i++;
    printf OUTFILE ("<DT>%5d", $i);
    if ($response->is_success) {
        print OUTFILE qq{ <A HREF="$test_page">$test_page</A>\n};
        my $parser = HTML::LinkExtor->new(\&test_urls_cb, $test_page);
        $parser->parse($response->content);
    } else {
        print OUTFILE "--NOT--$test_page\n";
    }
}
$oldhandle = select(OUTFILE);
$~ = "END_HTML";
select ($oldhandle);
write OUTFILE;
close (INREPOSIT);
close (OUTFILE);
unlink ($reposit) or warn "Could not unlink $reposit: $!\n";
# NOTE: this callback is not actually used above -- the list of pages to
# test comes from the filesystem walk in wanted().  It is left here in
# case you would rather spider the site by following links instead.
sub get_test_pages_cb {
    my ($tag, %attributes) = @_;
    return unless (lc($tag) eq "a");   # only anchor tags, not "area" etc.
    foreach my $name (sort keys %attributes) {
        my $url = $attributes{$name};
        next unless ($url =~ m/^http:/i);
        print REPOSIT $url, "\n";
    }
}
sub test_urls_cb {
    my ($tag, %attributes) = @_;
    return unless (lc($tag) eq "a");   # only anchor tags, not "area" etc.
    foreach my $name (sort keys %attributes) {
        my $url = $attributes{$name};
        next unless ($url =~ m/^http:/i);
        # print $url, "\n";
        my $request = HTTP::Request->new(GET => $url);
        my $response = $ua->request($request);
        if ($response->is_success) {
            # Add commands here if the link is successful, e.g.
            # print OUTFILE "<DD>$url\n";
        } else {
            print OUTFILE "<DD> --BAD--$url\n";
        }
    }
}
# Called by finddepth() for every file under $base_dir; turns each
# .html file into the corresponding URL and queues it in REPOSIT.
sub wanted {
    if (/\.html$/) {
        my $temp = $File::Find::name;
        $temp =~ s/\Q$base_dir//;
        print REPOSIT $base_url, $temp, "\n";
    }
}
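# The two formats below are the HTML header and footer of the report.
# In a format picture line a bare @ starts a field, so the literal @ in
# the mailto address is supplied by the "@" on the argument line after it.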
format BEGIN_HTML =
<html>
<head>
<title>Dead Links</title>
</head>
<body>
<h1>@<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< Fix Your Links Map</h1>
$base_url
Lynx Rules! <BR>
<PRE>
.
format END_HTML =
</PRE>
<P>
Originally developed by: <br>
<address>
<a href="mailto:brian@brie.com">Brian Lavender</a>
"@"
</address>
</body>
</html>
.
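Since I said feel free to hack it, here is one idea (just a sketch, I
haven't beat on it much): test the links with HEAD requests instead of
GET, so you don't download every remote page. In test_urls_cb, something
like:

  my $request  = HTTP::Request->new(HEAD => $url);
  my $response = $ua->request($request);
  # A few broken servers refuse HEAD, so fall back to GET before
  # declaring the link bad.
  if (! $response->is_success) {
      $response = $ua->request(HTTP::Request->new(GET => $url));
  }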
--
Brian Lavender
http://www.brie.com/brian/