Go Premium for a chance to win a PS4. Enter to Win

x
  • Status: Solved
  • Priority: Medium
  • Security: Public
  • Views: 223
  • Last Modified:

How to parse a text file group by group in Perl?

This is a follow up question for ID: 27285438

This is the final code that I use. This code works very well except for the files section.


This is the code I use:

my $cache_data = submitFileParser($textFile);

sub submitFileParser ($) {
	my $filename = shift;
	my @paragraphs;
#	local($/) = '';
	open( FILE, "< $filename" ) or die "Can't open $filename : $!";
	@paragraphs = <FILE>;
	close FILE;
	return read_paragraphs (@paragraphs);
}


sub read_paragraphs (@) {
	# read lines as parameters
	my @rippedParagraphs = @_;
	my $submitFileExist = 0;
	my $submit_file = 0;
	my $noSubmitFileFlag = 0;
	
	#Decide if the file includes submit file or not
	for ( @rippedParagraphs ) {
	
		#obtain the login from USER=
		if (/^USER=(\S+)\,/) {
			$geckLogin = $1;
		}
		
		if (/^\s*Submit\s+file\s*$/) {
		$submitFileExist = 1;
		}
		if ($submitFileExist == 1) {
			if (/^\s*\=+\s*$/) {
			$submitFileExist = 2;
			}
		}
	}	
	
	if ($submitFileExist == 2){
	my @submits = ();
	# Storage for all sections
	# Temporary storages for single section of each type
	my (@Files, @CR, @RR, @CS, @Options, @Mailsent);
	# Flags for file traversal logic
	my ($opt_flag, $file_flag, $mail_sent_to_flag);

	$submit_file = 0;
	$noSubmitFileFlag = 0;
	#read the file
	for ( @rippedParagraphs ) {

		if (/^\s*Submit\s+file\s*$/) {
			# We record the accumulated data:
			push(
				@submits,
				{
					"Options"              => [@Options],
					"Files"                => [@Files],
					"Comments"             => [@CS],
					"RelatedRecords"       => [@RR],
					"CodeReviewers"        => [@CR],
					"GeckLogin" 		   => $geckLogin,
					"NoSubmitFileFlag"     => $noSubmitFileFlag,
					"Mail sent to"         => [@Mailsent],
				}
			) if @Files;
			@Options = @Files = @CR = @CS = @RR = ();
			$submit_file = 1;
			next;
		}
		if ($submit_file == 1) {
			if (/^\s*\=+\s*$/) {
				$submit_file++;
				$mail_sent_to_flag = 0;
			} else {
				$submit_file = 0; # two-line grammar didn't hold
			}
			next;
		}
		if ($submit_file == 2) {
			if ($mail_sent_to_flag) {
				push(@Mailsent, $_);
				next;
			}
			if (m|^\#\s*Sandbox\s+location\s*\:\s*\S*/sandbox/(.*?)/|) {
				# Match the login name in the submit file - if it has not
				# already been done
				$geckLogin ||= $1;
			}
			# If we encounter a comment or empty string
			if (/^\#/ || !/\S/) {
				# we haven't encountered an option to start doing anything
				next unless $opt_flag || $file_flag;
				# If we're done with options, let's start reading file sections
				if ($opt_flag == 1) {
					$opt_flag = 0;
					$file_flag = 1;
				}
				elsif ($opt_flag > 1) {
					# Addresses the empty line within Options:
					$opt_flag--;
				}
				next;
			}
			if (/^Options/) {
				# We start reading options
				$opt_flag = 2;
				next;
			}
			if (/^Mail sent to/) {
				$mail_sent_to_flag = 1;
				push(@Mailsent, $_);
				next;
			}
			# Matching beginning of the line to determine the type of the string
			# and placing it in temporary storage
			/^R(R|elated\sRecords):\s*(.*\n)/ && push(@RR, $2) && next;
			/^C(R|ode\sReviewer):\s*(.*\n)/ && push(@CR, $2) && next;
			/^C(S|omments):\s*(.*\n)/ && push(@CS, $2) && next;

			# General text is either files or options info, depending on the
			# value of the option flag
			$opt_flag ? push(@Options, $_) : push(@Files, $_);
        }
	}
	push(
		@submits,
		{
			"Options"              => [@Options],
			"Files"                => [@Files],
			"Comments"             => [@CS],
			"RelatedRecords"       => [@RR],
			"CodeReviewers"        => [@CR],
			"Mail sent to"         => [@Mailsent],
			"GeckLogin" 		   => $geckLogin,
			"NoSubmitFileFlag"     => $noSubmitFileFlag,
		}
	) if @Files;
	return \@submits;
	}
	else{
	my @noSubmitFileSubmits = ();
	$submit_file = 0; # two-line grammar didn't hold
			my $parsedData = parseWithoutSubmitFile(@rippedParagraphs);
			#submit file does not exist flag
			$noSubmitFileFlag = 1;
				push(
		@noSubmitFileSubmits,
		{
			"GeckLogin" 		   => $geckLogin,
			"ParsedData"		   => $parsedData,
			"NoSubmitFileFlag"     => $noSubmitFileFlag,
			"Cluster"              => $parsedData->{t},
			"JobID"                => $parsedData->{dollar_},
			"gLogFilesOption"      => exists $parsedData->{GLOGFILES},
			"gLogSbcheckOption"    => exists $parsedData->{GLOGSBCHECK},
		}
	) if $parsedData;
	return \@noSubmitFileSubmits;
	}
}


# we parse token differently if user makes the submission without submit file
sub parseWithoutSubmitFile (@) {
	my $arg_flag = 0;
	my $parsedData = {};
	my $current_option = '';
		while (my $line = shift @_) {
		if ($arg_flag == 1) {
		if ($line =~ /^Currently (\$\_=.*)/) {
		local $_;
		eval "$1;";
		$parsedData->{dollar_} = $_;
		$arg_flag = 0;
		}
		elsif ($line =~ /^\s+\-(.*)/) {
		$current_option = $1;
		$parsedData->{$current_option} = undef;
		next;
		}
		elsif ($current_option && $line =~ /^\s+(.*)/) {
		$parsedData->{$current_option} = $1;
		$current_option = undef;
		}
		}
		else {
		if ($line =~ /^Original arguments:/) {
		$arg_flag = 1;
		next;
		}
		}
	}
return $parsedData;
}

Open in new window



This is the file that I parse:

USER=testman, HOST=testman-deb6-64, ARCH=glnxa64
Revisions: /st/hub/share/apps/bat//share/mmit: 07/26-09:48:58; csubmitItem.pm: 2011/07/26-09:48:56
Original arguments:
        -t
        Atk
        -F
        20110914.submit
Currently $_='154551'

        main:/st/hub/share/apps/bat/bat2.15.17/share/../lib/csubmitCache.pm:44 called main::submissionHistory
        main:/st/hub/share/apps/bat/bat2.15.17/share/submit:3871 called main::CreateCacheFile

Current directory ($PWD) = /st/devel/sandbox/testman/Aslrtw
                Submit file
        ===========================
# Component        : Coder
# Sandbox location : /st/devel/sandbox/testman/Atk
# Submission for   : 2000
#
# Description:
#   Unlocking making changes
#
# Documentation impact:
#   None
#
# QE items:
#   None
#
# Type of change:
#   Unlocking making changes
#

# submit file for use with msubmit.  To use run the command
#      submit -F 24.submit
#   or use C-c C-c from emacs to run this command.
# "<a href='http://www-sandbox/testman/Atk/glnxa64'>/sandbox/testman/Atk_ests/glnxa64</a>"
# "No need for sbruntests: Interactive Tests Update"
Options:

-CJ "<a href='http://www-sandbox/testman/Atk_tests/glnxa64'>/sandbox/testman/Atk_tests/glnxa64</a>"
-nowrap
-subject "Unlocking making changes"
-KEYWORD1
-KEYWORD2

st/ert/variants/variants4.c
CR: testman2
RR: 123456
CS: Unlocking before making changes

Mail sent to:
    st.devel.submit: Unlocking making changes
    Files:
    st/ert/variants/variants5.c

	
				Submit file
        ===========================
# Component        : Coder
# Sandbox location : /st/devel/sandbox/testman/Atk
# Submission for   : 2000
#
# Description:
#   Unlocking making changes
#
# Documentation impact:
#   None
#
# QE items:
#   None
#
# Type of change:
#   Unlocking making changes
#

# submit file for use with msubmit.  To use run the command
#      submit -F 14.submit
#   or use C-c C-c from emacs to run this command.
# "<a href='http://www-sandbox/testman/Atk_tests/glnxa64'>/sandbox/testman/Atk_tests/glnxa64</a>"
# "No need for sbruntests: Interactive Tests Update"
Options:

-CJ "<a href='http://www-sandbox/testman/Atk_tests/glnxa64'>/sandbox/testman/Atk_tests/glnxa64</a>"
-nowrap
-subject "Unlocking making changes"
-KEYWORD1
-KEYWORD2

st/ert/variants/variants5.c
st/ert/variants/variants6.c
CR: testman2
RR: 333333, 444444
CS: Unlocking before making changes

st/ert/variants/variants7.c
CR: testman2
RR: 555555, 666666
CS: Unlocking before making changes

Mail sent to:
    st.devel.submit: Unlocking making changes
    Files:
    st/ert/variants/variants5.c

Open in new window



When I debug this code and do the following, I get the list of numbers for each group separately.

x @{$cache_data->[1]{RelatedRecords}};

Open in new window


The result is here. And this is what I expected.
0  "333333, 444444\cM\cJ"
1  "555555, 666666\cM\cJ"

Open in new window


However, when I do the following:
x @{$cache_data->[1]{Files}}

Open in new window


The result is:
0  "st/ert/variants/variants5.c\cM\cJ"
1  "st/ert/variants/variants6.c\cM\cJ"
2  "st/ert/variants/variants7.c\cM\cJ"

Open in new window


However, I expect to get the following

0  "st/ert/variants/variants5.c\cM\cJ"
     "st/ert/variants/variants6.c\cM\cJ"
1  "st/ert/variants/variants7.c\cM\cJ"

Open in new window


Because when you look at the text file that I parse,

these two are in one group:
"st/ert/variants/variants5.c\cM\cJ"
"st/ert/variants/variants6.c\cM\cJ"

Open in new window


and this one is in another group.

"st/ert/variants/variants7.c\cM\cJ"

Open in new window


What is the problem with this code and how can I fix this issue?

 If I cannot fix it the code will be entirely useless.

Can you please help me?

Thanks,
0
Tolgar
Asked:
Tolgar
  • 11
  • 6
1 Solution
 
parparovCommented:
Well, the problem is a bit ambiguous grammar.
How should we determine that a new section of submitted files ended and a new started?
You told me that CS marker is by no means mandatory end of that section.
0
 
TolgarAuthor Commented:
you are right, CS does not have to be at the end.

But, if the code detects all of the below, then it means we are done with one group.

CR (or Code Reviewer),
CS (or Comments)
RR (or Related Records)
 
And we can continue with the other group.

Does this help a bit?

Thanks,
 


0
 
parparovCommented:
Probably... Gotta think about it. Will post an update on Monday.
0
Concerto's Cloud Advisory Services

Want to avoid the missteps to gaining all the benefits of the cloud? Learn more about the different assessment options from our Cloud Advisory team.

 
TolgarAuthor Commented:
I see. Would it be possible to do it during tomorrow?

I am planning to complete the rest of it during Sunday.

Thanks
0
 
parparovCommented:
I can't promise... my weekends are usually loaded with family matters.
0
 
TolgarAuthor Commented:
I understand. I would be pleased if you can do your best.

Thanks,
0
 
TolgarAuthor Commented:
Hi,
I wonder if there is any progress.

Thanks,
0
 
parparovCommented:
If we discard the 'Mail sent to:' field, would that be ok?
0
 
TolgarAuthor Commented:
absolutely. I don't even need it. As far as I remember, we included "mail sent to" in order to separate it from other fields.

Thanks,
0
 
parparovCommented:
Well, consider this rework of my previous code (note the change in the data structures):
#!/usr/bin/perl

use strict;
use warnings;

our @HEADERS = ("GeckLogin", "Options", "FileInfo", "Mail sent to");
our @SUBHEADERS = ("Files", "Comments", "RelatedRecords", "CodeReviewers",);
# a prototype for convenience)
sub print_data1 ($);
sub print_data2 ($);
sub submitFileParser($);

my $data = submitFileParser(shift @ARGV);
my $geckLogin;
use Data::Dumper;
# A look at the data
print Dumper $data;

# Examples of accessing data
print_data1($data);
print "++++++++++++++++++++\n";
print_data2($data);
print "++++++++++++++++++++\n";

my @rr = @{$data->[0]{FileInfo}[0]{RelatedRecords}};
print Dumper \@rr;

sub print_data1 ($) {
	my $data = shift;

	for my $submit (@{$data}) {
		for my $header (@HEADERS) {
			print "$header:\n";
			if ($header eq 'GeckLogin') {
				print "$submit->{$header}\n";
			}
			elsif ($header eq 'FileInfo') {
				for my $subitem (@{$submit->{$header}}) {
					for my $subheader (@SUBHEADERS) {
						print "\t$subheader:\n";
						print @{$subitem->{$subheader}};
					}
				}
			}
			else {
				print @{$submit->{$header}};
			}
			print "\n";
		}
		print "\n";
	}
}

sub print_data2 ($) {
	my $data = shift;

	for my $header (@HEADERS) {
		if ($header eq 'GeckLogin') {
			print "GeckLogin: $data->[0]{GeckLogin}\n";
			next;
		}
		elsif ($header eq 'FileInfo') {
			for my $i (1..@{$data}) {
				for my $subheader (@SUBHEADERS) {
					print "\t$subheader:\n";
					for my $j (1..@{$data->[$i-1]{$header}}) {
						print "From submit file $i item $j\n";
						print @{$data->[$i-1]{$header}[$j-1]{$subheader}};
						print "\n";
					}
				}
			}
			next;
		}
		print "$header:\n";
		for my $i (1..@{$data}) {
			print "From submit file $i\n";
			print @{$data->[$i-1]{$header}};
			print "\n";
		}
		print "\n========\n";
	}
}

sub submitFileParser ($) {
	my $filename = shift;
	my @paragraphs;
#	local($/) = '';
	open( FILE, "< $filename" ) or die "Can't open $filename : $!";
	@paragraphs = <FILE>;
	close FILE;
	return read_paragraphs (@paragraphs);
}

sub read_paragraphs (@) {
	# read lines as parameters
	my @rippedParagraphs = @_;
	my @submits = ();
	# Storage for all sections
	# Temporary storages for single section of each type
	my (@Files, @CR, @RR, @CS, @Options, @Mailsent, @file_info);
	# Flags for file traversal logic
	my ($opt_flag, $file_flag, $mail_sent_to_flag);

	my $submit_file = 0;
	#read the file
	for ( @rippedParagraphs ) {
		if (/^USER=(\S+)\,/) {
			#obtain the login from USER=
			$geckLogin = $1;
		}
		if (/^\s*Submit\s+file\s*$/) {
			# We record the accumulated data:
			push(
				@submits,
				{
					"Options"              => [@Options],
					"FileInfo"             => [@file_info],
					"GeckLogin" 		   => $geckLogin,
					"Mail sent to"         => [@Mailsent],
				}
			) if @file_info;
			@Options = @Mailsent = @file_info = ();
			$submit_file = 1;
			next;
		}
		if ($submit_file == 1) {
			if (/^\s*\=+\s*$/) {
				$submit_file++;
				$mail_sent_to_flag = 0;
			} else {
				$submit_file = 0; # two-line grammar didn't hold
			}
			next;
		}
		if ($submit_file == 2) {
			if ($mail_sent_to_flag) {
				push(@Mailsent, $_);
				next;
			}
			if (m|^\#\s*Sandbox\s+location\s*\:\s*\S*/sandbox/(.*?)/|) {
				# Match the login name in the submit file - if it has not
				# already been done
				$geckLogin ||= $1;
			}
			# If we encounter a comment or empty string
			if (/^\#/ || !/\S/) {
				# we haven't encountered an option to start doing anything
				next unless $opt_flag || $file_flag;
				# If we're done with options, let's start reading file sections
				if ($opt_flag == 1) {
					$opt_flag = 0;
					$file_flag = 1;
				}
				elsif ($opt_flag > 1) {
					# Addresses the empty line within Options:
					$opt_flag--;
				}
				next;
			}
			if (/^Options/) {
				# We start reading options
				$opt_flag = 2;
				next;
			}
			if (/^Mail sent to/) {
				$mail_sent_to_flag = 1;
				push(@Mailsent, $_);
				next;
			}
			# Matching beginning of the line to determine the type of the string
			# and placing it in temporary storage
			/^R(R|elated\sRecords):\s*(.*\n)/ && push(@RR, $2) && goto CHECK;
			/^C(R|ode\sReviewer):\s*(.*\n)/ && push(@CR, $2) && goto CHECK;
			/^C(S|omments):\s*(.*\n)/ && push(@CS, $2) && goto CHECK;

			# General text is either files or options info, depending on the
			# value of the option flag
			$opt_flag ? push(@Options, $_) : push(@Files, $_);
			CHECK:
			if (@RR && @CR && @CS) {
				push(
					@file_info,
					{
						"Files"                => [@Files],
						"Comments"             => [@CS],
						"RelatedRecords"       => [@RR],
						"CodeReviewers"        => [@CR],
					},
				);
				@Files = @CS = @RR = @CR = ();
			}
        }
	}
	push(
		@submits,
		{
			"Options"              => [@Options],
			"Mail sent to"         => [@Mailsent],
			"FileInfo"             => [@file_info],
			"GeckLogin" 		   => $geckLogin,
		}
	) if @file_info;
	return \@submits;
}

Open in new window

0
 
TolgarAuthor Commented:
@parparov: for some reason this line now does not catch the submit file flag:

if (/^\s*Submit\s+file\s*$/) {

Open in new window


do you have any idea?

when I print the @rippedParagraphs I can see the intire file with Submit file flag. But When I print it only rerturns \c\\.

I am pretty sure that it was working last night.

Here is my little modified code:

sub read_paragraphs (@) {
	# read lines as parameters
	my @rippedParagraphs = @_;
	my @submits = ();
	# Storage for all sections
	# Temporary storages for single section of each type
	my (@Files, @CR, @RR, @CS, @Options, @Mailsent, @file_info);
	# Flags for file traversal logic
	my ($opt_flag, $file_flag, $mail_sent_to_flag);

	my $submit_file = 0;
	my $noSubmitFileFlag = 0;
	#read the file
	for ( @rippedParagraphs ) {
		if (/^USER=(\S+)\,/) {
			#obtain the login from USER=
			$geckLogin = $1;
		}
		if (/^\s*Submit\s+file\s*$/) {
			# We record the accumulated data:
			push(
				@submits,
				{
					"Options"              => [@Options],
					"FileInfo"             => [@file_info],
					"GeckLogin" 		   => $geckLogin,
					"NoSubmitFileFlag"     => $noSubmitFileFlag,
					"Mail sent to"         => [@Mailsent],
				}
			) if @file_info;
			@Options = @Mailsent = @file_info = ();
			$submit_file = 1;
			next;
		}
		else {
			my @noSubmitFileSubmits = ();
			$submit_file = 0; # two-line grammar didn't hold
			my $parsedData = parseWithoutSubmitFile(@rippedParagraphs);
			#submit file does not exist flag
			$noSubmitFileFlag = 1;
			push(
				@noSubmitFileSubmits,
				{
				"GeckLogin" 		   => $geckLogin,
				"ParsedData"		   => $parsedData,
				"NoSubmitFileFlag"     => $noSubmitFileFlag,
				"Cluster"              => $parsedData->{t},
				"JobID"                => $parsedData->{dollar_},
				"gLogFilesOption"      => exists $parsedData->{GLOGFILES},
				"gLogSbcheckOption"    => exists $parsedData->{GLOGSBCHECK},
				}
			) if $parsedData;
				return \@noSubmitFileSubmits;
		}
		if ($submit_file == 1) {
			if (/^\s*\=+\s*$/) {
				$submit_file++;
				$mail_sent_to_flag = 0;
			} 
			next;
		}
		if ($submit_file == 2) {
			if ($mail_sent_to_flag) {
				push(@Mailsent, $_);
				next;
			}
			#if (m|^\#\s*Sandbox\s+location\s*\:\s*\S*/sandbox/(.*?)/|) {
			#	# Match the login name in the submit file - if it has not
			#	# already been done
			#	$geckLogin ||= $1;
			#}
			# If we encounter a comment or empty string
			if (/^\#/ || !/\S/) {
				# we haven't encountered an option to start doing anything
				next unless $opt_flag || $file_flag;
				# If we're done with options, let's start reading file sections
				if ($opt_flag == 1) {
					$opt_flag = 0;
					$file_flag = 1;
				}
				elsif ($opt_flag > 1) {
					# Addresses the empty line within Options:
					$opt_flag--;
				}
				next;
			}
			if (/^Options/) {
				# We start reading options
				$opt_flag = 2;
				next;
			}
			if (/^Mail sent to/) {
				$mail_sent_to_flag = 1;
				push(@Mailsent, $_);
				next;
			}
			# Matching beginning of the line to determine the type of the string
			# and placing it in temporary storage
			/^R(R|elated\sRecords):\s*(.*\n)/ && push(@RR, $2) && goto CHECK;
			/^C(R|ode\sReviewer):\s*(.*\n)/ && push(@CR, $2) && goto CHECK;
			/^C(S|omments):\s*(.*\n)/ && push(@CS, $2) && goto CHECK;

			# General text is either files or options info, depending on the
			# value of the option flag
			$opt_flag ? push(@Options, $_) : push(@Files, $_);
			CHECK:
			if (@RR && @CR && @CS) {
				push(
					@file_info,
					{
						"Files"                => [@Files],
						"Comments"             => [@CS],
						"RelatedRecords"       => [@RR],
						"CodeReviewers"        => [@CR],
						"NoSubmitFileFlag"     => $noSubmitFileFlag,
					},
				);
				@Files = @CS = @RR = @CR = ();
			}
        }
	}
	push(
		@submits,
		{
			"Options"              => [@Options],
			"Mail sent to"         => [@Mailsent],
			"FileInfo"             => [@file_info],
			"GeckLogin" 		   => $geckLogin,
		}
	) if @file_info;
	return \@submits;
}

Open in new window

0
 
TolgarAuthor Commented:
@parparov: I think i found the problem. Please wait.

0
 
TolgarAuthor Commented:
@parparov: ok I fixed it.

But I found a limitation which comes from my initial definition of the problem.

If comments section has more than one lines then the second line is ignored.

So if comments have more than one line then, all these lines must be in one element of the comments array of the related group.

Can we fix this problem?

If this is a major change, I can create another question.

Thanks,
0
 
TolgarAuthor Commented:
@parparov: I created another question for this change.

ID: 27382844
0
 
TolgarAuthor Commented:
@parparov:

When I run this line in the code:

@Related_Records = @{$cache_data->[$i]{FileInfo}->[$j]{RelatedRecords}};

$RelatedRecordList = (join("\n", @Related_Records))."\n";

Open in new window


I get this:

DB<5> x $RelatedRecordList
0  "123456\cM\cJ\cJ"

Open in new window


Is there any way to get rid of these \CM\CJ\Cj kind of characters (line endings I guess) in general?


They are after every variable.

Thanks,


0
 
TolgarAuthor Commented:
@parparov: Any idea about the last two posts?

Thanks,
0
 
parparovCommented:
\cJ are the very "\n"s you're joining with.
\cM are \r chars you can get rid of if you want by adding
s/\r//g;

Open in new window

after
	for ( @rippedParagraphs ) {

Open in new window

0

Featured Post

Free Tool: ZipGrep

ZipGrep is a utility that can list and search zip (.war, .ear, .jar, etc) archives for text patterns, without the need to extract the archive's contents.

One of a set of tools we're offering as a way to say thank you for being a part of the community.

  • 11
  • 6
Tackle projects and never again get stuck behind a technical roadblock.
Join Now