#!/usr/bin/perl
#
# Program Name: getPage.pl
# Use: getPage.pl url
# Description: This program uses the LWP::Simple module. It fetches the
# web page named on the command line, extracts every href="..." link found
# on that page and prints the list to STDOUT, one URL per line.
# The starting url is canonized first. For example http://x.y.z will
# have a slash appended to it, while http://x.y.z/mypage.html will not.
# Local references are expanded to include the source page url.
# For example: href="localpage.htm" will have http://x.y.z/ prepended to
# the local reference. mailto: links are not listed.
#
###########

use strict;
use warnings;
use LWP::Simple;

# Get the URL or exit with a usage message.
if (!defined $ARGV[0]) {
    print "Usage is 'getPage.pl URL'\n";
    exit;
}

# $pg is the url that is fetched; $basepg is the prefix used to expand
# local references found on that page.
my ($pg, $basepg) = canonize_url($ARGV[0]);

# Go get the page. This "get" is made available by LWP::Simple and returns
# undef on failure, so test definedness: an empty (but successfully
# fetched) page is not an error, it simply has no links.
my $webpg = get($pg);
die "no page $pg\n" if !defined $webpg;

# De-localize each link if necessary and print it; skipped links
# (mailto:) come back as undef.
for my $href (extract_hrefs($webpg)) {
    my $link = resolve_link($basepg, $href);
    print $link . "\n" if defined $link;
}

exit;

# canonize_url($url)
#   Canonizes the starting url given on the command line.
#   Returns a two-element list: (url to fetch, base url for local links).
#   - A url ending in "html" or "htm" is fetched unchanged; its base is
#     everything up to and including the last "/" (empty string if the
#     url contains no "/").
#   - Any other url gets a trailing "/" appended when it lacks one, and
#     serves as its own base.
sub canonize_url {
    my ($page) = @_;
    chomp $page;

    if ($page =~ /html?\z/) {
        # Greedy .* backs off to the LAST slash, whatever characters
        # (hyphens, tildes, percent-escapes, ...) the url contains.
        my ($base) = $page =~ m{\A(.*/)};
        return ($page, defined $base ? $base : '');
    }

    $page .= '/' if $page !~ m{/\z};
    return ($page, $page);
}

# extract_hrefs($html)
#   Returns the list of all non-empty, double-quoted href attribute
#   values in $html, in document order. Attribute name matching is
#   case-insensitive and several links on one line are all returned.
#   A value never spans a line break.
sub extract_hrefs {
    my ($html) = @_;

    # [^"\n]+ rather than .+? so that href="" cannot swallow the closing
    # quote and run on into the next attribute.
    my @hrefs = $html =~ /href="([^"\n]+)"/gi;
    return @hrefs;
}

# resolve_link($base, $href)
#   Turns one href value into the string to print.
#   Returns nothing (undef in scalar context) for mailto: links, the href
#   itself when it already starts with http://, https:// or ftp://, and
#   otherwise the href expanded against $base.
sub resolve_link {
    my ($base, $href) = @_;

    # Anchor the scheme tests: a local page merely containing "mailto",
    # "http" or "ftp" somewhere in its name is still a local page.
    return if $href =~ /\Amailto:/i;
    return $href if $href =~ m{\A(?:https?|ftp)://}i;

    # Links starting with "/" are relative to the site root ("/x") or to
    # the scheme only ("//host/x"), not to the directory of the page.
    if ($href =~ m{\A/}
        && $base =~ m{\A(([A-Za-z][A-Za-z0-9+.\-]*:)//[^/]+)})
    {
        my ($site_root, $scheme) = ($1, $2);
        return $href =~ m{\A//} ? $scheme . $href : $site_root . $href;
    }

    # Plain local reference: prepend the source page's base url.
    return $base . $href;
}