Counting lines of a CSV file in Perl

Hi there,

I have a Perl script which I'm trying to modify.

I would like the script to record the number of rows (without the header row) as a variable called $peaks for each CSV file, somewhere around line 35 in the script:

####################
#FOR EACH CSV FILE:
####################

foreach my $fil (@files) {

    # get wellposition from filename
    my ($wellposition) = $fil =~ m{^\d+_\w+_([A-P]\d+)\.csv$};
    open my $in, '<', $fil or die "could not open $fil: $!";

    # record number of rows in CSV file (not including header)
    # line of code needed here: $peaks = number of rows in file (excluding header)

Open in new window



This variable will be output for each result in the script:

line 167: print {$out} "Wellposition ($wellposition) Results [b]$peaks[/b] peaks:\n\n",

Open in new window


The complete script is coded below.

Thanks,

Stephen.



#!/usr/bin/perl
use strict;
use warnings;

my $len = 0; # hack global

##########################################################################
#Script to identify animal species using monoisotopic peak markers against
#MS data
##########################################################################

# define directory
my $dir = 'C:/Users/Stephen/Desktop/test2/relmonopeaklists';
chdir $dir or die "could not cd to $dir: $!";

# create or overwrite SpeciesId
open my $out, '>', 'SpeciesId' or die "could not write SpeciesId: $!";

##########################################################################
#FILE HANDLING
##########################################################################

# get the list of csv files; a lexical directory handle is used so the
# handle is not a package-global bareword
opendir my $dh, '.' or die "could not open dir: $!";
my @files = sort grep { m{^\d+_\w+_[A-P]\d+\.csv$} } readdir $dh;
closedir $dh;

####################
#FOR EACH CSV FILE:
####################

foreach my $fil (@files) {
    # get wellposition from filename
    my ($wellposition) = $fil =~ m{^\d+_\w+_([A-P]\d+)\.csv$};
    open my $in, '<', $fil or die "could not open $fil: $!";

    # record every accepted (mass, intensity) row, in file order
    my %masses;
    my @top;
    while (my $line = <$in>) {
        chomp $line;
        # skip header line
        next if $line =~ m{mass.*intensity};
        my ($mass, $intensity) = split /,/, $line;
        unless ($mass =~ m{^\d+(?:\.\d+)$}) {
            warn "mass ($mass) not a recognized number - skipping";
            next;
        }
        push @top, [$mass, $intensity];
    }
    close $in;

    # Take the first 50 accepted rows (fewer if the file is shorter), sort
    # them by descending intensity and count the rounded masses of the five
    # strongest.  Both slice bounds are clamped to the data that exists:
    # fixed 0..49 / 0..4 slices yield undef entries for short files, and an
    # undef entry cannot be dereferenced as a [mass, intensity] pair.
    my $last_row     = $#top < 49 ? $#top : 49;
    my @by_intensity = sort { $b->[1] <=> $a->[1] } @top[0 .. $last_row];
    my $last_peak    = $#by_intensity < 4 ? $#by_intensity : 4;
    $masses{ round($_->[0]) }++ for @by_intensity[0 .. $last_peak];

    # pass masses hash to subroutine
    my $data = analyze(\%masses);
    output($wellposition, $data);
}

# buffered write errors only surface when the handle is closed
close $out or die "could not close SpeciesId: $!";

##########################################################################
#SUB-ROUTINES
##########################################################################

sub round {
    my $value = shift;

    # Split the text into the part kept (integer digits plus at most one
    # decimal digit) and the single digit that follows it, if any.
    my ($kept, $next_digit) = $value =~ m{^(\d+(?:\.\d)?)(\d)?};

    # Round half up on the digit after the first decimal place.
    if (defined $next_digit && $next_digit >= 5) {
        $kept += 0.1;
    }

    # Make sure the result always ends in exactly one decimal digit.
    $kept .= '.0' if $kept !~ m{\.\d$};

    return $kept;
}

# main sub
{ # closure
# keep %species local to sub-routine but only init it once
# layout: $species{species name}{marker key} = [ mass, optional second mass ]
my %species;

# _init()
# Load the species marker table from Species_Int.txt into %species.
# Blank lines are skipped.  A line of the form
#     [LETTER] [=] mass [AND mass]
# is a marker for the most recent species name line; every other line is
# taken as a new species name.  Also grows the file-level $len to the
# longest species name seen (output() uses it as a column width).
# Dies if the file cannot be opened.
sub _init {

    open my $in, '<', 'Species_Int.txt' or die "could not open Species_Int.txt: $!";
    my $spec;

    # Counter for markers that carry no letter.  It lives inside the sub
    # because an initialised "my" variable in this enclosing block would not
    # have been assigned yet when analyze() is first called from the main
    # loop, which runs before execution reaches this block.
    my $unlabelled = 0;

    while (my $line = <$in>) {
        chomp $line;
        next if $line =~ /^\s*$/; # skip blank lines
        if ($line =~ m{^([A-Z]?)\s*=?\s*(\d+(?:\.\d)?)(?:\s+AND\s+(\d+(?:\.\d)?))?\s*$}) {
            # handle letter = lines
            my ($ltr, $mass_a, $mass_b) = ($1, $2, $3);
            unless (defined $spec) {
                warn "marker line ($line) appears before any species name - skipping";
                next;
            }
            # unlabelled markers get a generated key that cannot collide
            # with a single capital letter
            my $key = $ltr ne '' ? $ltr : 'anon' . ++$unlabelled;
            push @{ $species{$spec}{$key} }, $mass_a;
            push @{ $species{$spec}{$key} }, $mass_b if defined $mass_b;
        } else {
            # handle species name lines
            $spec = $line;
            $len = length($spec) if (length($spec) > $len);
        }
    }
    close $in;
}

# analyze(\%masses)
# Takes a hash ref whose keys are the masses observed in one CSV file.
# For every species, counts the markers whose masses are ALL present as
# keys of %masses.  Returns a hash ref
#     { species name => { cnt => matched markers, pct => percentage } }
# containing only species with at least one matched marker; pct is
# cnt / (number of markers of that species) * 100, passed through round().
# NOTE(review): the lookup is an exact string match on the hash key, and
# the caller builds its keys with round(), which always yields one decimal
# digit - a marker written without a decimal in the species file would
# therefore never match; confirm the file format.
sub analyze {
    my ($masses) = @_;
    _init() unless %species;
    my %data;
    # loop over species entries
    foreach my $spec (keys %species) {
        # loop over each marker of a species
LTR:
        foreach my $ltr (keys %{$species{$spec}}) {
            # loop over each mass for a marker
            foreach my $mass (@{$species{$spec}{$ltr}}) {
                # skip to next marker if any of its masses is not found
                next LTR unless exists($masses->{$mass});
            }
            # every mass of this marker was found
            $data{$spec}{cnt}++;
        }
    }
    # add percentages
    foreach my $spec (keys %data) {
        $data{$spec}{pct} = round($data{$spec}{cnt} / scalar(keys %{$species{$spec}}) * 100);
    }
    return \%data;
}
} # end closure

##########################################################################
#RESULTS
##########################################################################

{ # begin closure
# results of the well currently being printed; file-scoped to this block so
# that the sort comparator below can see it
my $data;

# Sort comparator: highest percentage first, ties broken by highest
# match count.
sub _cust_sort {
    return $data->{$b}{pct} <=> $data->{$a}{pct}
        || $data->{$b}{cnt} <=> $data->{$a}{cnt};
}

# output($wellposition, $data)
# Writes the heading for one well and up to five best-ranked species to the
# results file handle.
sub output {
    my $wellposition = shift;
    $data = shift;
    my @ranked = sort _cust_sort keys %$data;

    print {$out} "Wellposition ($wellposition) Results:\n\n",
                 "Top 5 Species Identities:\n";

    # print out the top 5
    my $rank = 0;
    while ($rank < 5) {
        my $name = $ranked[$rank];
        if (!$name) {
            # ran out of species before reaching five
            print "no more matches\n";
            last;
        }
        $rank++;
        printf {$out} "%d) %-${len}s  %d matches  %0.1f%%\n", $rank, $name, $data->{$name}{cnt}, $data->{$name}{pct};
    }
}
} # end closure

Open in new window

StephenMcGowanAsked:
Who is Participating?
I wear a lot of hats...

"The solutions and answers provided on Experts Exchange have been extremely helpful to me over the last few years. I wear a lot of hats - Developer, Database Administrator, Help Desk, etc., so I know a lot of things but not a lot about one thing. Experts Exchange gives me answers from people who do know a lot about one thing, in an easy-to-use platform." -Todd S.

ozoCommented:
foreach my $fil (@files) {
    # get wellposition from filename
    my ($wellposition) = $fil =~ m{^\d+_\w+_([A-P]\d+)\.csv$};
    open my $in, '<', $fil or die "could not open $fil: $!";
   

# record all masses from the file
    my %masses;
    my @top;
    while (<$in>) {
        chomp;
        # skip header line
        next if m{mass.*intensity};
        my ($mass,$intensity) = split /,/;
        unless ($mass =~ m{^\d+(?:\.\d+)$}) {
            warn "mass ($mass) not a recognized number - skipping";
            next;
        }
       push @top,[$mass,$intensity];

    }
   $peaks = $. - 1;
# or, if you want the number of lines actually recorded;  $peaks = @top;

    close $in;
0

Experts Exchange Solution brought to you by

Your issues matter to us.

Facing a tech roadblock? Get the help and guidance you need from experienced professionals who care. Ask your question anytime, anywhere, with no hassle.

Start your 7-day free trial
StephenMcGowanAuthor Commented:
Thanks ozo,

It seems that if I add $peaks for the lines:

line 74: [b]$peaks[/b] = $. - 1;

and 

line 168: print {$out} "Wellposition ($wellposition) Results: [b]$peaks[/b] \n\n",

Open in new window


I need to define the explicit package name for $peaks.

I've tried adding:   my $peaks; to define $peaks but I'm receiving the error message:

"Use of uninitialized value $peaks in concatenation (.) or string at Id_script.pl"

Where should I define $peaks within the script?

Thanks again,
0
ozoCommented:
I would suggest passing it

line 79: output($wellposition, $data, scalar @top);

line 166:  my $peaks = shift;
0
JavaScript Best Practices

Save hours in development time and avoid common mistakes by learning the best practices to use for JavaScript.

StephenMcGowanAuthor Commented:
Thanks again ozo, works a treat :)

Just wondering, I have a very similar script, and would like to do the same thing with this:

#!/usr/bin/perl
use strict;
use warnings;

my $len = 0; # hack global because it's simpler

##########################################################################
#Script to identify animal species using monoisotopic peak markers against
#MS data
##########################################################################

# forward slashes in dir name should work
my $dir = 'C:/Users/Stephen/Desktop/monopd/monopeaklists';
chdir $dir or die "could not cd to $dir: $!";

# create or overwrite SpeciesId
open my $out, '>', 'SpeciesId' or die "could not write SpeciesId: $!";

##########################################################################
#FILE HANDLING
##########################################################################

# get the list of csv files; a lexical directory handle is used so the
# handle is not a package-global bareword
opendir my $dh, '.' or die "could not open dir: $!";
my @files = sort grep { m{^\d+_\w+_[A-P]\d+\.csv$} } readdir $dh;
closedir $dh;

####################
#FOR EACH CSV FILE:
####################

foreach my $fil (@files) {
    # get wellposition from filename
    my ($wellposition) = $fil =~ m{^\d+_\w+_([A-P]\d+)\.csv$};
    open my $in, '<', $fil or die "could not open $fil: $!";

    # record all masses from the file, keyed by their rounded value
    my %masses;
    while (my $line = <$in>) {
        chomp $line;
        # skip header line
        next if $line =~ m{mass.*intensity};
        # only the first column (the mass) is used
        my ($mass) = split /,/, $line;
        unless ($mass =~ m{^\d+(?:\.\d+)$}) {
            warn "mass ($mass) not a recognized number - skipping";
            next;
        }
        $masses{ round($mass) }++;
    }
    close $in;
    # pass masses hash to subroutine
    my $data = analyze(\%masses);
    output($wellposition, $data);
}

# buffered write errors only surface when the handle is closed
close $out or die "could not close SpeciesId: $!";

##########################################################################
#SUB-ROUTINES
##########################################################################

sub round {
    my $text = shift;

    # $whole: integer digits plus at most one decimal digit.
    # $extra: the digit right after that, when there is one.
    my ($whole, $extra) = $text =~ m{^(\d+(?:\.\d)?)(\d)?};

    # bump the kept decimal when the following digit is 5 or more
    if (defined $extra && $extra >= 5) {
        $whole += 0.1;
    }

    # results always carry one decimal digit: add .0 when there is none
    # (the alternative of stripping a trailing .0 is deliberately not used)
    $whole .= '.0' if $whole !~ m{\.\d$};

    return $whole;
}

# main sub
{ # closure
# %species is private to the two subs below and is filled on first use
# layout: $species{species name}{letter} = [ mass, optional second mass ]
my %species;

# _init()
# Read SpeciesId_rodents.txt into %species.  Blank lines are ignored; a
# "LETTER = mass [AND mass]" line is a marker of the most recent species
# name line; any other line starts a new species.  Tracks the longest
# species name in the file-level $len.
sub _init {
    open my $in, '<', 'SpeciesId_rodents.txt' or die "could not open SpeciesId_rodents.txt: $!";
    my $current;
    while (defined(my $row = <$in>)) {
        chomp $row;
        next if $row =~ /^\s*$/; # skip blank lines
        if ($row =~ m{^([A-Z])\s*=\s*(\d+(?:\.\d)?)(?:\s+AND\s+(\d+(?:\.\d)?))?$}) {
            # marker line: a later line with the same letter replaces it
            my ($letter, $first, $second) = ($1, $2, $3);
            my @required = ($first);
            push @required, $second if $second;
            $species{$current}{$letter} = \@required;
        }
        else {
            # species name line
            $current = $row;
            my $width = length($current);
            $len = $width if $width > $len;
        }
    }
    close $in;
}

# analyze(\%masses)
# For each species, count the markers whose masses are all keys of the
# given hash.  Returns { species => { cnt => ..., pct => ... } } for the
# species with at least one matched marker.
sub analyze {
    my ($masses) = @_;
    _init() unless %species;
    my %data;
    for my $name (keys %species) {
        my $markers = $species{$name};
        for my $letter (keys %$markers) {
            # a marker counts only when none of its masses is missing
            my $missing = grep { !exists $masses->{$_} } @{ $markers->{$letter} };
            $data{$name}{cnt}++ unless $missing;
        }
    }
    # add percentages
    for my $name (keys %data) {
        my $total = scalar(keys %{ $species{$name} });
        $data{$name}{pct} = round($data{$name}{cnt} / $total * 100);
    }
    return \%data;
}
} # end closure

##########################################################################
#RESULTS
##########################################################################

{ # begin closure
# results of the well currently being printed, visible to the comparator
my $data;

# Sort comparator: percentage descending, then match count descending.
sub _cust_sort {
    return $data->{$b}{pct} <=> $data->{$a}{pct}
        || $data->{$b}{cnt} <=> $data->{$a}{cnt};
}

# output($wellposition, $data)
# Writes the heading for one well followed by at most five ranked species
# to the results file handle.
sub output {
    my $wellposition = shift;
    $data = shift;
    my @ranked = sort _cust_sort keys %$data;

    print {$out} "Wellposition ($wellposition) Results:\n\n",
                 "Top 5 Species Identities:\n";

    # print out the top 5
    my $rank = 0;
    while ($rank < 5) {
        my $name = $ranked[$rank];
        if (!$name) {
            # fewer than five species matched
            print "no more matches\n";
            last;
        }
        $rank++;
        printf {$out} "%d) %-${len}s  %d matches  %0.1f%%\n", $rank, $name, $data->{$name}{cnt}, $data->{$name}{pct};
    }
}
} # end closure

Open in new window


This script doesn't use @top, it only uses $mass and $masses:

# record all masses from the file
# (%masses maps each rounded mass to the number of rows that produced it)
    my %masses;
    while (<$in>) {
        chomp;
        # skip header line
        next if m{mass.*intensity};
        # only the first comma-separated field (the mass) is kept
        my ($mass) = split /,/;
        # the pattern requires digits, a dot and more digits
        unless ($mass =~ m{^\d+(?:\.\d+)$}) {
            warn "mass ($mass) not a recognized number - skipping";
            next;
        }
        # round() is defined elsewhere in the script
        $mass = round($mass);
        $masses{$mass}++;
    }
    close $in;

If I were to determine $peaks = $. - 1, how would this be passed further down the script? (Previously you used scalar @top to pass this value.)


Thanks again, really appreciate it.
0
ozoCommented:
output($wellposition, $data, $peaks);
0
StephenMcGowanAuthor Commented:
Thanks ozo,

I've tried using $peaks,

However, the script is currently falling over, I think it's due to $peaks.

The error I'm getting is now:

"Can't use string ("92") as a HASH ref while "strict refs" in use at Script.pl line 141"

The line in question is this:

line 139: my $peaks = shift;
line 140: $data = shift;
line 141: my @order = sort _cust_sort keys %$data;


I'm fairly certain ("92") is $peaks (as the first CSV file contains 92 rows).


The full code is below.

Cheers again.

#!/usr/bin/perl
use strict;
use warnings;

my $len = 0; # hack global because it's simpler

##########################################################################
#Script to identify animal species using monoisotopic peak markers against
#MS data
##########################################################################

# forward slashes in dir name should work
my $dir = 'C:/Users/Stephen/Desktop/monopd/monopeaklists';
chdir $dir or die "could not cd to $dir: $!";

# create or overwrite SpeciesId
open my $out, '>', 'SpeciesId' or die "could not write SpeciesId: $!";

##########################################################################
#FILE HANDLING
##########################################################################

# get the list of csv files; a lexical directory handle is used so the
# handle is not a package-global bareword
opendir my $dh, '.' or die "could not open dir: $!";
my @files = sort grep { m{^\d+_\w+_[A-P]\d+\.csv$} } readdir $dh;
closedir $dh;

####################
#FOR EACH CSV FILE:
####################

foreach my $fil (@files) {
    # get wellposition from filename
    my ($wellposition) = $fil =~ m{^\d+_\w+_([A-P]\d+)\.csv$};
    open my $in, '<', $fil or die "could not open $fil: $!";

    # record all masses from the file, keyed by their rounded value
    my %masses;
    # Number of rows in the file, header excluded.  Counted per non-header
    # line so that it is defined (0) for an empty file and is not affected
    # by rows that are skipped further down as malformed.
    my $peaks = 0;
    while (my $line = <$in>) {
        chomp $line;
        # skip header line
        next if $line =~ m{mass.*intensity};
        $peaks++;
        # only the first column (the mass) is used
        my ($mass) = split /,/, $line;
        unless ($mass =~ m{^\d+(?:\.\d+)$}) {
            warn "mass ($mass) not a recognized number - skipping";
            next;
        }
        $masses{ round($mass) }++;
    }
    close $in;
    # pass masses hash to subroutine
    my $data = analyze(\%masses);
    output($wellposition, $data, $peaks);
}

# buffered write errors only surface when the handle is closed
close $out or die "could not close SpeciesId: $!";

##########################################################################
#SUB-ROUTINES
##########################################################################

sub round {
    my ($input) = @_;

    # keep the integer digits and at most one decimal digit; remember the
    # digit that follows, if the input has one
    my ($result, $following) = $input =~ m{^(\d+(?:\.\d)?)(\d)?};

    # round half up based on that following digit
    if (defined $following && $following >= 5) {
        $result += 0.1;
    }

    # add .0 to end of number if no decimal, so results always have one
    # decimal digit (stripping a trailing .0 instead is not done here)
    $result .= '.0' if $result !~ m{\.\d$};

    return $result;
}

# main sub
{ # closure
# %species is private to the two subs below and is filled on first use
# layout: $species{species name}{letter} = [ mass, optional second mass ]
my %species;

# _init()
# Read SpeciesId_rodents.txt into %species.  Blank lines are ignored; a
# "LETTER = mass [AND mass]" line is a marker of the most recent species
# name line; any other line starts a new species.  Tracks the longest
# species name in the file-level $len.
sub _init {
    open my $in, '<', 'SpeciesId_rodents.txt' or die "could not open SpeciesId_rodents.txt: $!";
    my $current;
    while (defined(my $row = <$in>)) {
        chomp $row;
        next if $row =~ /^\s*$/; # skip blank lines
        if ($row =~ m{^([A-Z])\s*=\s*(\d+(?:\.\d)?)(?:\s+AND\s+(\d+(?:\.\d)?))?$}) {
            # marker line: a later line with the same letter replaces it
            my ($letter, $first, $second) = ($1, $2, $3);
            my @required = ($first);
            push @required, $second if $second;
            $species{$current}{$letter} = \@required;
        }
        else {
            # species name line
            $current = $row;
            my $width = length($current);
            $len = $width if $width > $len;
        }
    }
    close $in;
}

# analyze(\%masses)
# For each species, count the markers whose masses are all keys of the
# given hash.  Returns { species => { cnt => ..., pct => ... } } for the
# species with at least one matched marker.
sub analyze {
    my ($masses) = @_;
    _init() unless %species;
    my %data;
    for my $name (keys %species) {
        my $markers = $species{$name};
        for my $letter (keys %$markers) {
            # a marker counts only when none of its masses is missing
            my $missing = grep { !exists $masses->{$_} } @{ $markers->{$letter} };
            $data{$name}{cnt}++ unless $missing;
        }
    }
    # add percentages
    for my $name (keys %data) {
        my $total = scalar(keys %{ $species{$name} });
        $data{$name}{pct} = round($data{$name}{cnt} / $total * 100);
    }
    return \%data;
}
} # end closure

##########################################################################
#RESULTS
##########################################################################

{ # begin closure
# results of the well currently being printed; kept at block scope because
# the sort comparator below only receives $a and $b
my $data;

# Sort comparator: highest percentage first, ties broken by highest
# match count.
sub _cust_sort {
    if ($data->{$b}{pct} == $data->{$a}{pct}) {
        return $data->{$b}{cnt} <=> $data->{$a}{cnt};
    }
    return $data->{$b}{pct} <=> $data->{$a}{pct};
}

# output($wellposition, $data, $peaks)
#   $wellposition - well label taken from the CSV file name
#   $data         - hash ref returned by analyze():
#                   { species => { cnt => ..., pct => ... } }
#   $peaks        - number of rows counted in the CSV file (optional,
#                   treated as 0 when not supplied)
# Writes the heading for one well and up to five best-ranked species to the
# results file handle.
# The arguments are unpacked in the order the caller passes them; taking
# $peaks before $data made the row count end up in $data and caused
# 'Can't use string ("92") as a HASH ref' at the "keys %$data" below.
sub output {
    my ($wellposition, $results, $peaks) = @_;
    $data  = $results;
    $peaks = 0 unless defined $peaks;
    my @order = sort _cust_sort keys %$data;
    print {$out} "Wellposition ($wellposition) Results:($peaks peaks) \n\n",
                 "Top 5 Species Identities:\n";
    # print out the top 5
    for my $i (0..4) {
        my $spec = $order[$i];
        unless ($spec) {
            # NOTE(review): this message goes to STDOUT, not to the results
            # file - confirm that is intended
            print "no more matches\n";
            last; # exit loop
        }
        printf {$out} "%d) %-${len}s  %d matches  %0.1f%%\n", $i+1, $spec, $data->{$spec}{cnt}, $data->{$spec}{pct};
    }
}
} # end closure

Open in new window

0
ozoCommented:
$data = shift;
  my $peaks = shift;
0
StephenMcGowanAuthor Commented:
Thanks a lot ozo! :)
0
It's more than this solution. Get answers and train to solve all your tech problems - anytime, anywhere. Try it for free. Edge Out The Competition for your dream job with proven skills and certifications. Get started today. Stand Out as the employee with proven skills. Start learning today for free. Move Your Career Forward with certification training in the latest technologies. Start your trial today.
Perl

From novice to tech pro — start learning today.