Short Perl Source Example
This Perl example is a small program that fetches the source of the web
page named on its command line, extracts the URLs from that page, and
prints them to STDOUT.
#!/usr/bin/perl
#
# Program Name: getPage.pl
# Use: getPage.pl url
# description: This program uses the LWP::Simple module. It will
# get a web page passed to it on the command line then strip out
# and list all the urls on that page, returning the list to STDOUT.
# The program will canonicalize the url. For example http://x.y.z will
# have a slash appended to it, while http://x.y.z/mypage.html will not.
# Also, local references will be expanded to include the source page url.
# For example: href="localpage.htm" will have http://x.y.z/ prepended to
# the local reference.
#
#
###########
use strict;
use warnings;
use LWP::Simple;
# Script-wide state shared by the sections below.
my $pg;        # url of the page to fetch, taken from the command line
my $basepg;    # prefix prepended to local (relative) links before printing
my $url;       # holds one extracted link at a time while the list is printed

# Get the URL from the command line, or exit with a usage message.
# A missing or empty argument is a usage error: the message goes to STDERR
# and the exit status is non-zero so a calling shell script can tell the
# failure from a successful run. $0 is used so the message always names the
# script the way it was actually invoked.
if (!defined $ARGV[0] || $ARGV[0] eq '') {
    print STDERR "Usage is '$0 URL'\n";
    exit 1;
}
else {
    $pg = $ARGV[0];
    chomp($pg);    # tolerate a trailing newline in the argument
}
# Canonicalize the initial url.
#
# canonicalize_page_url($page) returns the two-element list ($page, $base):
#   $page - the url that will be fetched
#   $base - the prefix to prepend to local links found on that page
#
# If the url names a specific html file (ends in ".html" or ".htm", in any
# letter case) the url itself is left alone and the base is everything up to
# and including the last slash. Taking the text up to the last slash, rather
# than matching a fixed set of "url characters", keeps hosts and directories
# containing characters such as "-" or "~" intact. If the url contains no
# slash at all the base is the empty string.
#
# Otherwise the url is treated as a directory: a trailing slash is appended
# if it is missing, and the result serves as both the page and the base.
sub canonicalize_page_url {
    my ($page) = @_;

    if ($page =~ m{\.html?\z}i) {
        my ($base) = $page =~ m{\A(.*/)}s;
        return ($page, defined $base ? $base : '');
    }

    $page .= '/' if $page !~ m{/\z};
    return ($page, $page);
}

($pg, $basepg) = canonicalize_page_url($pg);
# Go get the page. This "get" is made available by LWP::Simple, which
# returns undef when the fetch fails. The result is tested with defined()
# rather than for truth, so a page whose entire body is empty or "0" is
# accepted instead of being reported as missing.
my $webpg = get($pg);
die "no page $pg\n" unless defined $webpg;
# Strip out the urls named on the page and load them into @urls.
#
# extract_hrefs($html) returns, in document order, the value of every
# href attribute found in $html:
#   - every href is returned, including several on one line
#   - the attribute name is matched case-insensitively (href, HREF, ...)
#   - the value may be in double or single quotes, with optional whitespace
#     around the "="
#   - the value is matched up to its own closing quote, so a neighbouring
#     attribute such as target="_blank" is never swallowed
#   - empty values (href="") are skipped
# An undefined $html yields an empty list.
sub extract_hrefs {
    my ($html) = @_;
    return () unless defined $html;

    my @links;
    while ($html =~ m{\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')}gi) {
        my $link = defined $1 ? $1 : $2;
        push @links, $link if length $link;
    }
    return @links;
}

my @urls = extract_hrefs($webpg);
# De-localize a local page url if necessary and print it, but don't print
# mailto links.
#
# printable_link($base, $link) returns the text to print for $link, or
# nothing (undef in scalar context) when the link must be skipped:
#   - "mailto:" links are skipped
#   - links that start with http://, https:// or ftp:// are returned as-is
#   - links starting with "//" (protocol-relative) get the scheme of $base
#   - links starting with "/" (root-relative) get the scheme and host of $base
#   - anything else is a local reference and gets $base prepended
# The scheme tests are anchored at the start of the link, so a local file
# that merely contains "http", "ftp" or "mailto" in its name (for example
# "ftp-info.html") is still treated as a local reference.
sub printable_link {
    my ($base, $link) = @_;
    $base = '' unless defined $base;

    return if $link =~ m{\Amailto:}i;
    return $link if $link =~ m{\A(?:https?|ftp)://}i;

    if ($link =~ m{\A/}
        && $base =~ m{\A(([A-Za-z][A-Za-z0-9+.\-]*:)//[^/]+)})
    {
        # Copy the captures before running another match.
        my ($origin, $scheme) = ($1, $2);
        return $link =~ m{\A//} ? $scheme . $link : $origin . $link;
    }

    return $base . $link;
}

foreach $url (@urls) {
    my $out = printable_link($basepg, $url);
    next unless defined $out;
    print $out . "\n";
}
# All links have been printed; end the program with a success status.
exit 0;
Mail to mjr