kilkennp
asked on
Don't understand output
this code is supposed to take in two files one which has a language in the form of:
in 0.021268266230653
er 0.0199459816856101
an 0.0180031452539636
he 0.0167558696165399
on 0.0161781277728501
th 0.0155111263808014
re 0.0127007970702476
and another which takes in song titles like:
this is a song title1 (4.30)
this is a song title2 (4.30)
I want to split the information from the second file (per word) into bigrams and then get the corresponding frequencies from my language model then calculte the probability of that word being in the model. so (4.30) should return a probability of 0 but it doesn't. Anyone know why?
How is the code? Is there any way I could tidy it up a bit?
thanks.
Code:
#!usr/bin/perl
use strict;
use warnings;
use diagnostics;
use POSIX qw(log10);
use FileHandle;
#open file handles for languagemodel and test file
my $fh = new FileHandle;
my $fh2 = new FileHandle;
$fh->open("<$ARGV[0]") or die "could not open file\n";
$fh2->open("<$ARGV[1]") or die "could not open file\n";
#open a file to output the results to
open(OUTTRACKS,">trackbigr ams") or die "could not open trackbigrams\n";
my $line;
my %bigramfrequency;
my $totalbigram;
my %bititles;
my @titlesbigram;
my @bifrequency;
my @frequency;
my @letterline;
my $result;
my $word;
#subroutine to get bigrams
sub bigram()
{
for (my $i=0; $i <= $#letterline-1; $i++)
{
my $bigram = $letterline[$i] . $letterline[$i+1];
$bigramfrequency{$bigram}+ +;
$totalbigram++;
}
$_ /= $totalbigram foreach values %bigramfrequency;
return %bigramfrequency;
}
#read in the language model values into a hash table
sub buildlanguage($)
{
my $filehandle = shift;
my %languagemodel;
while (<$filehandle>)
{
chomp;
my ($key, $value) = split /\s/, $_, 2;
$languagemodel{$key} = $value;
}
return %languagemodel;
}
#build the language model from the first file passed in
my %model = buildlanguage($fh);
#subroutine to get the corresponding frequency for the bigrams
sub lookupfreq
{
for (my $i=0; $i<@titlesbigram; $i++) {
if (exists $model{$titlesbigram[$i]})
{
$frequency[$i] = $model{$titlesbigram[$i]};
}
else
{
$frequency[$i] = 0;
}
}
return @frequency;
}
#calculate the probability
sub getProbability
{
my $run_total;
foreach (@bifrequency)
{
$run_total += $_;
}
my $nth_root = scalar(@bifrequency);
my $log_e = log10($run_total);
my $prob = exp($log_e/$nth_root);
return $prob;
}
#open a file handle for the test file
my $fileHandle2 = $fh2;
while (<$fileHandle2>)
{
$line = $_;
chomp $line;
$line = lc($line);
#wordline contains each word of the string
my @wordline = split /[^\p{IsL}\d()]+/, $line;
foreach $word (@wordline)
{
#letter line contains just a letter
@letterline = split //, $word;
%bititles = bigram();
@titlesbigram = keys (%bititles);
@bifrequency = lookupfreq();
$result = getProbability();
print OUTTRACKS "$word\t $result\n"
}
}
$fh->close();
$fh2->close();
close(OUTTRACKS);
in 0.021268266230653
er 0.0199459816856101
an 0.0180031452539636
he 0.0167558696165399
on 0.0161781277728501
th 0.0155111263808014
re 0.0127007970702476
and another which takes in song titles like:
this is a song title1 (4.30)
this is a song title2 (4.30)
I want to split the information from the second file (per word) into bigrams and then get the corresponding frequencies from my language model then calculte the probability of that word being in the model. so (4.30) should return a probability of 0 but it doesn't. Anyone know why?
How is the code? Is there any way I could tidy it up a bit?
thanks.
Code:
#!usr/bin/perl
use strict;
use warnings;
use diagnostics;
use POSIX qw(log10);
use FileHandle;
#open file handles for languagemodel and test file
my $fh = new FileHandle;
my $fh2 = new FileHandle;
$fh->open("<$ARGV[0]") or die "could not open file\n";
$fh2->open("<$ARGV[1]") or die "could not open file\n";
#open a file to output the results to
open(OUTTRACKS,">trackbigr
my $line;
my %bigramfrequency;
my $totalbigram;
my %bititles;
my @titlesbigram;
my @bifrequency;
my @frequency;
my @letterline;
my $result;
my $word;
#subroutine to get bigrams
sub bigram()
{
for (my $i=0; $i <= $#letterline-1; $i++)
{
my $bigram = $letterline[$i] . $letterline[$i+1];
$bigramfrequency{$bigram}+
$totalbigram++;
}
$_ /= $totalbigram foreach values %bigramfrequency;
return %bigramfrequency;
}
#read in the language model values into a hash table
sub buildlanguage($)
{
my $filehandle = shift;
my %languagemodel;
while (<$filehandle>)
{
chomp;
my ($key, $value) = split /\s/, $_, 2;
$languagemodel{$key} = $value;
}
return %languagemodel;
}
#build the language model from the first file passed in
my %model = buildlanguage($fh);
#subroutine to get the corresponding frequency for the bigrams
sub lookupfreq
{
for (my $i=0; $i<@titlesbigram; $i++) {
if (exists $model{$titlesbigram[$i]})
{
$frequency[$i] = $model{$titlesbigram[$i]};
}
else
{
$frequency[$i] = 0;
}
}
return @frequency;
}
#calculate the probability
sub getProbability
{
my $run_total;
foreach (@bifrequency)
{
$run_total += $_;
}
my $nth_root = scalar(@bifrequency);
my $log_e = log10($run_total);
my $prob = exp($log_e/$nth_root);
return $prob;
}
#open a file handle for the test file
my $fileHandle2 = $fh2;
while (<$fileHandle2>)
{
$line = $_;
chomp $line;
$line = lc($line);
#wordline contains each word of the string
my @wordline = split /[^\p{IsL}\d()]+/, $line;
foreach $word (@wordline)
{
#letter line contains just a letter
@letterline = split //, $word;
%bititles = bigram();
@titlesbigram = keys (%bititles);
@bifrequency = lookupfreq();
$result = getProbability();
print OUTTRACKS "$word\t $result\n"
}
}
$fh->close();
$fh2->close();
close(OUTTRACKS);
ASKER
I've looked at the problem some more and it seems that the problem is with the %bititles as you pointed out. Every time the sub bigram is called there are still bigrams remaining in the hash table from the previous word which messes up the results. Is there any way one can make sure this hash table is empty for each word it gets the bigrams for i.e. each iteration of the 'foreach $word (@wordline)' loop.
Alternatively, I could get rid of this hash table and put the bigrams into an array (@titlesbigram) because as you rightly pointed out I never use the values the in %bititles.....this is because I took this sub from another program I had wrote.
$result is a decemial value got from calling getProbability (). It takes an array of decmial values and calculates the probability.
thanks for any further help you can give me
Alternatively, I could get rid of this hash table and put the bigrams into an array (@titlesbigram) because as you rightly pointed out I never use the values the in %bititles.....this is because I took this sub from another program I had wrote.
$result is a decemial value got from calling getProbability (). It takes an array of decmial values and calculates the probability.
thanks for any further help you can give me
foreach ( split /[^\p{IsL}\d()]+/, $line ){
@bifrequency = map{$model{$_}||0}/(..)/g, /(?<=.)(.. )/g;
$result = getProbability();
print OUTTRACKS "$word\t $result\n"
}
@bifrequency = map{$model{$_}||0}/(..)/g,
$result = getProbability();
print OUTTRACKS "$word\t $result\n"
}
ASKER
I don't understand this code. What does it do? where in the original code should i put it?
it would replace
#wordline contains each word of the string
my @wordline = split /[^\p{IsL}\d()]+/, $line;
foreach $word (@wordline)
{
#letter line contains just a letter
@letterline = split //, $word;
%bititles = bigram();
@titlesbigram = keys (%bititles);
@bifrequency = lookupfreq();
$result = getProbability();
print OUTTRACKS "$word\t $result\n"
}
}
to generate @bifrequency
(although I'm still not sure why getProbability would be doing exp(log10())
#wordline contains each word of the string
my @wordline = split /[^\p{IsL}\d()]+/, $line;
foreach $word (@wordline)
{
#letter line contains just a letter
@letterline = split //, $word;
%bititles = bigram();
@titlesbigram = keys (%bititles);
@bifrequency = lookupfreq();
$result = getProbability();
print OUTTRACKS "$word\t $result\n"
}
}
to generate @bifrequency
(although I'm still not sure why getProbability would be doing exp(log10())
ASKER
sorry but I still don't understand your piece of code. Where are the bigrams made? When is the corresponding frequency got? can I not still use the subroutines that I wrote?
The getProbability should take an array of frequencies of the bigrams for a word. it is to calculate the nth root of the sum of the frequencies passed in in the array.
so if we take the string "hello world", I want to take the first word and get the bigrams i.e. "he, el, ll, lo" for each of these I want to get their corresponding value from the %model which will be less than 1. I then want to pass these frequencies into getprobability to sum them up and get the nth root of them. that is what result should be then. I want to do this for every word in the string.
The getProbability should take an array of frequencies of the bigrams for a word. it is to calculate the nth root of the sum of the frequencies passed in in the array.
so if we take the string "hello world", I want to take the first word and get the bigrams i.e. "he, el, ll, lo" for each of these I want to get their corresponding value from the %model which will be less than 1. I then want to pass these frequencies into getprobability to sum them up and get the nth root of them. that is what result should be then. I want to do this for every word in the string.
ASKER CERTIFIED SOLUTION
membership
Create a free account to see this answer
Signing up is free and takes 30 seconds. No credit card required.
exp(log10($run_total)/$nth
I don't know what you would expect such a quantity to represent.
#subroutine to get bigrams
sub bigram()
{
for (my $i=0; $i <= $#letterline-1; $i++)
{
my $bigram = $letterline[$i] . $letterline[$i+1];
$bigramfrequency{$bigram}+
$totalbigram++;
}
$_ /= $totalbigram foreach values %bigramfrequency;
return %bigramfrequency;
}
once a bigram goes into bigramfrequency, it stay there forever, so keys (%bititles) is every bigram ever seen in $fh2
and every entry is divided by $totalbigram every time you call sub bigram, so some entrys can get very small.
Which seems to be pointless since I don't see the values in %bititles being used anywhere)
What is the formula you are trying to implement in computing $result?