• Status: Solved
  • Priority: Medium
  • Security: Public
  • Views: 242
  • Last Modified:

HTML:Parser cgi script

I need someone to show me how to write a cgi script to run on windows & unix called from browser, the code needs to loop through a list (list.dat) of urls ie:
http://www.awebsite.com
http://www.awebsite2.co.uk
http://www.awebsite3.net
http://www.awebsite4.org

call up the url's and write/print the following to a tab delimeted file, info.txt

1. The first 8 characters of the url ie: ' awebsite ' or everything between www. and .com (or whatever after . ie .net .co.uk .org) if the address has less than 8 characters. If the address is a repeat then add number
(incremented) ' awebsite_1 '.
2. The 'url'
3. The 'title'
4. The 'action' ie http://www.awebsite.com/cgi-bin/script.cgi
5. The method ie: POST
6. The 'input' name - values ie:
<input type=hidden name='user' value='JohnGreen'>
<input type=hidden name='state' value='NewYork'>
user=JohnGreen,state=NewYork
7. Print a line ------------- between each loop

Write all the above to the browser

write 1. to another file info2.txt ie:
<option value="awebsite">awebsite:General: FFS</option>
0
malkie
Asked:
malkie
  • 7
  • 5
1 Solution
 
ozoCommented:
Where do the 'title', 'action', and 'input' name - values come from?
0
 
TintinCommented:
Your specification is reasonably clear apart from the stuff ozo mentioned.  

Once you've supplied that, the rest of it is pretty straight forward.
0
 
malkieAuthor Commented:
The 'title', 'action', and 'input' name - values comes from the html source code of the url which is called. So if no 'title' then the script would write 'no title' to the file and so on.
0
Receive 1:1 tech help

Solve your biggest tech problems alongside global tech experts with 1:1 help.

 
ozoCommented:
use LWP::Simple;
require HTML::Parser;
{
     package MyParser;
     use base qw(HTML::Parser);
    sub start {
        my ($self,$tag, $attr, $attrseq, $origtext) = @_;
        $in{$tag}++;
        if( $tag eq "form" ){
            print "action: $attr->{action}\n";
            print "method: $attr->{method}\n";
        }
        if( $tag eq "input" ){
            print "$origtext\n";
        }
        if( $tag eq "oprion" ){
            $option=$origtext;
        }

    }
    sub end {
        my ($self,$tag, $attr, $attrseq, $origtext) = @_;
        $in{$tag}--;
        if( $tag eq "title" ){
            $title ||= 'no title';
            print "title: $title\n";
            $option = qq(<option value="$main::u">$main::u:$title</option>\n);
            $title='';
        }
    }
    sub text {
        my ($self,$text) = @_;
        if( $in{title} ){
                $title .= $text;
        }
    }
}

sub start {
  my ($tag, $attr, $attrseq) = @_;

}
$p = new MyParser;
open L,"<list.dat" or die "Can't open list.dat $!";
open I1,">info.txt" or die "Can't open info.txt $!";
open I2,">info2.txt" or die "Can't open info2.txt $!";
select I1;

while( <L> ){
     chomp;
     ($u) = m'//(?:www\.)?([^.]{0,8})';
     if( $n = $n{$u}++ ){ $u .= "_$n"; }
     next unless $content = get($_);
     print I1 $sep;
     $sep = "-------------\n";
     print I1 "$_\n";
     $p->parse($content);
     print I2 $MyParser::option;
}
0
 
malkieAuthor Commented:
I ran the script calling " ozo1.cgi " from the browser on windows and it came up with:-
CGI Error
The specified CGI application misbehaved by not returning a complete set of HTTP headers. The headers it did return are:

Can't open list.dat No such file or directory at C:\Inetpub\wwwroot\cgi-bin\extracthtml\ozo1.cgi line 44.

I added:-
# Absolute path to list.dat file:
my $infile = "/Inetpub/wwwroot/cgi-bin/extracthtml/list.dat";

#open L,"<list.dat" or die "Can't open list.dat $!";
open L,"<$infile" or die "Can't open list.dat $!";

I ran the script and it came up with:-
CGI Error
The specified CGI application misbehaved by not returning a complete set of HTTP headers. The headers it did return are:
0
 
ozoCommented:
use CGI qw/:standard/;
use CGI::Carp 'fatalsToBrowser';

my $infile = "/Inetpub/wwwroot/cgi-bin/extracthtml/list.dat";

open L,"<$infile" or die "Can't open $infile $!";
print  header,
       start_html('HTML:Parser cgi script');
print "<pre>";

while( <L> ){
    chomp;
    ($u) = m'//(?:www\.)?([^.]{0,8})';
    if( $n = $n{$u}++ ){ $u .= "_$n"; }
    next unless $content = get($_);
    print "$sep$_\n";
    print I1 "$sep$_\n";  
    $sep = "-------------\n";
    $p->parse($content);
    select I1;
    $p->parse($content);
    select STDOUT;
    print I2 $MyParser::option;
}
print "</pre>";
print end_html;
0
 
malkieAuthor Commented:
As code below, it prints to browser.
It is not writing to files, I tried to put this code in various places???

#### WHERE TO PUT THIS CODE IT IS NOT WRITING TO FILES
open I1,">info.txt" or die "Can't open info.txt $!";
open I2,">info2.txt" or die "Can't open info2.txt $!";
select I1;

--------
 
use LWP::Simple;
require HTML::Parser;
use CGI qw/:standard/;
use CGI::Carp 'fatalsToBrowser';

# Absolute path to list.dat file:
my $infile = "/Inetpub/wwwroot/cgi-bin/extracthtml/list.dat";
{
    package MyParser;
    use base qw(HTML::Parser);
   sub start {
       my ($self,$tag, $attr, $attrseq, $origtext) = @_;
       $in{$tag}++;
       if( $tag eq "form" ){
           print "action: $attr->{action}\n";
           print "method: $attr->{method}\n";
       }
       if( $tag eq "input" ){
           print "$origtext\n";
       }
       if( $tag eq "option" ){
           $option=$origtext;
       }

   }
   sub end {
       my ($self,$tag, $attr, $attrseq, $origtext) = @_;
       $in{$tag}--;
       if( $tag eq "title" ){
           $title ||= 'no title';
           print "title: $title\n";
           $option = qq(<option value="$main::u">$main::u:$title</option>\n);
           $title='';
       }
   }
   sub text {
       my ($self,$text) = @_;
       if( $in{title} ){
               $title .= $text;
       }
   }
}

sub start {
 my ($tag, $attr, $attrseq) = @_;

}
$p = new MyParser;
#open L,"<list.dat" or die "Can't open list.dat $!";

open L,"<$infile" or die "Can't open list.dat $!";
print  header,
      start_html('HTML:Parser cgi script');
print "<pre>";
while( <L> ){
    chomp;
    ($u) = m'//(?:www\.)?([^.]{0,8})';
    if( $n = $n{$u}++ ){ $u .= "_$n"; }
    next unless $content = get($_);
    print I1 $sep;
    $sep = "-------------\n";
    print I1 "$_\n";
    $p->parse($content);
    print I2 $MyParser::option;
}
print "</pre>";
print end_html;

#### WHERE TO PUT THIS CODE IT IS NOT WRITING TO FILES
open I1,">info.txt" or die "Can't open info.txt $!";
open I2,">info2.txt" or die "Can't open info2.txt $!";
select I1;
0
 
malkieAuthor Commented:
I have tried to move this part of the code from where it is above:-
#### WHERE TO PUT THIS CODE IT IS NOT WRITING TO FILES
open I1,">info.txt" or die "Can't open info.txt $!";
open I2,">info2.txt" or die "Can't open info2.txt $!";
select I1;

but get the following error:-

CGI Error
The specified CGI application misbehaved by not returning a complete set of HTTP headers. The headers it did return are:
Prototype mismatch: sub main::head ($) vs none at C:/Perl/lib/CGI.pm line 231.
0
 
ozoCommented:
use CGI;
use CGI::Carp 'fatalsToBrowser';
use LWP::Simple;
require HTML::Parser;
{
    package MyParser;
    use base qw(HTML::Parser);
   sub start {
       my ($self,$tag, $attr, $attrseq, $origtext) = @_;
       $in{$tag}++;
       if( $tag eq "form" ){
           print "action: $attr->{action}\n";
           print "method: $attr->{method}\n";
       }
       if( $tag eq "input" ){
           print "$origtext\n";
       }
       if( $tag eq "oprion" ){
           $option=$origtext;
       }

   }
   sub end {
       my ($self,$tag, $attr, $attrseq, $origtext) = @_;
       $in{$tag}--;
       if( $tag eq "title" ){
           $title ||= 'no title';
           print "title: $title\n";
           $option = qq(<option value="$main::u">$main::u:$title</option>\n);
           $title='';
       }
   }
   sub text {
       my ($self,$text) = @_;
       if( $in{title} ){
               $title .= $text;
       }
   }
}


$p = new MyParser;

my $infile = "/Inetpub/wwwroot/cgi-bin/extracthtml/list.dat";
my $infofile = "/Inetpub/wwwroot/cgi-bin/extracthtml/info.txt";
my $infofile = "/Inetpub/wwwroot/cgi-bin/extracthtml/info2.txt";
open L,"<$infile" or die "Can't open $infile $!";
open I1,">$infofile" or die "Can't open $infofile $!";
open I2,">$info2file" or die "Can't open $info2file $!";
$q = new CGI;
print  $q->header,
       $q->start_html('HTML:Parser cgi script');
print "<pre>";

while( <L> ){
   chomp;
   ($u) = m'//(?:www\.)?([^.]{0,8})';
   if( $n = $n{$u}++ ){ $u .= "_$n"; }
   next unless $content = get($_);
   print "$sep$_\n";
   print I1 "$sep$_\n";  
   $sep = "-------------\n";
   $p->parse($content);
   select I1;
   $p->parse($content);
   select STDOUT;
   print I2 $MyParser::option;
}
print "</pre>";
print $q->end_html;
0
 
malkieAuthor Commented:
Sorry for my delay.

The snip from print below and what was required:-

info.txt is:-
http://yes/valuepairs1.html
title: test
action: http://yes/cgi-bin/ascript.pl
method: POST
<input type=hidden name='userdir' value='lnks'>
<input type=hidden name= lnkuser value= okay>
<input type=text name= emailid size=30>
<input type=text name="title" size=40>
<input type=text name="url" size=55>
<input type=submit value="Add">
<input type=reset>

required:-
yes/valu_1
http://yes/valuepairs1.html
test
http://yes/cgi-bin/ascript.pl
POST
<input type=hidden name='userdir' value='lnks'>
<input type=hidden name= lnkuser value= okay>
<input type=text name= emailid size=30>
<input type=text name="title" size=40>
<input type=text name="url" size=55>
<input type=submit value="Add">
<input type=reset>

info2.txt is:-
<option value="yes/valu_1">yes/valu_1:test</option>

required:-
<option value="yes/valu_1">yes/valu_1:General: FFS</option>
0
 
malkieAuthor Commented:
is it possible to make the two changes to the script???
0
 
ozoCommented:
use CGI;
use CGI::Carp 'fatalsToBrowser';
use LWP::Simple;
require HTML::Parser;
{
   package MyParser;
   use base qw(HTML::Parser);
  sub start {
      my ($self,$tag, $attr, $attrseq, $origtext) = @_;
      $in{$tag}++;
      if( $tag eq "form" ){
          print "$attr->{action}\n";
          print "$attr->{method}\n";
      }
      if( $tag eq "input" ){
          print "$origtext\n";
      }
      if( $tag eq "oprion" ){
          $option=$origtext;
      }

  }
  sub end {
      my ($self,$tag, $attr, $attrseq, $origtext) = @_;
      $in{$tag}--;
      if( $tag eq "title" ){
          $title ||= 'no title';
          print "title: $title\n";
          $option = qq(<option value="$main::u">$main::u:General: FFS</option>\n);
          $title='';
      }
  }
  sub text {
      my ($self,$text) = @_;
      if( $in{title} ){
          $title .= $text;
      }
  }
}


$p = new MyParser;

my $infile = "/Inetpub/wwwroot/cgi-bin/extracthtml/list.dat";
my $infofile = "/Inetpub/wwwroot/cgi-bin/extracthtml/info.txt";
my $info2file = "/Inetpub/wwwroot/cgi-bin/extracthtml/info2.txt";

open L,"<$infile" or die "Can't open $infile $!";
open I1,">$infofile" or die "Can't open $infofile $!";
open I2,">$info2file" or die "Can't open $info2file $!";
$q = new CGI;
print  $q->header,
      $q->start_html('HTML:Parser cgi script');
print "<pre>";

while( <L> ){
  chomp;
  ($u) = m'//(?:www\.)?([^.]{0,8})';
  if( $n = $n{$u}++ ){ $u .= "_$n"; }
  next unless $content = get($_);
  print "$sep$u\n$_\n";
  print I1 "$sep$u\n$_\n";  
  $sep = "-------------\n";
  $p->parse($content);
  select I1;
  $p->parse($content);
  select STDOUT;
  print I2 $MyParser::option;
}
print "</pre>";
print $q->end_html;
0
 
malkieAuthor Commented:
Thank you it works great. God Bless
0

Featured Post

Free Tool: IP Lookup

Get more info about an IP address or domain name, such as organization, abuse contacts and geolocation.

One of a set of tools we are providing to everyone as a way of saying thank you for being a part of the community.

  • 7
  • 5
Tackle projects and never again get stuck behind a technical roadblock.
Join Now