parsing a complex text file

fwsteal
fwsteal used Ask the Experts™
on
I need help parsing a complex text file.

example data:

"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 1",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"CLNRBUS                ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName         **Total**","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"8424700                  ","RDJMdicl                   ","          1","        158","        169","        109","         60","          0","          0","          0"
"8425106                  ","             8          9        ","  7        ","  0        ","  7        ","  0        ","  0        ","  0",
"8425822                  ","Sh Gol, D                  ","          4","         75","         87","         79","          8","          0","          0","          0"
,
"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 7",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"CLNRBUS                ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName                  ","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"8426300                  ","L T Nish M               ","          0","         38","         41","         38","          3","          0","          0","          0"
"8426400                  ","McC and Ates          ","          0","         49","         46","         18","         28","          0","          0","          0"
"8426278                  ","Thi Str Fam H S   ","        413","         24","         24","         13","         11","          0","          0","          0"
"CLNRBUS **Total**      ","Subscriber Count              117","      9,548","      4,785","      4,519","      3,599","        920","          0","          0","          0"
,
"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 956",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"WMDUTIL                ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName                  ","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"7700100                  ","Prac sen tra done N   ","          2","          0","          0","          0","          0","          0","          0","          0"
"WMDUTIL **Total**      ","Subscriber Count                1","          2","          0","          0","          0","          0","          0","          0","          0"
,
"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 742",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"PRFASP                ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName                  ","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"0718200                  ","Broooo Phaaaaaa 400              ","          2","          0","          0","          0","          0","          0","          0","          0"
"0710028                  ","Stoo And Shoo Phrrrrrr 155       ","          1","          0","          0","          0","          0","          0","          0","          0"
,
"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 957",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"GRAND TOTAL              ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName                  ","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"Grand Total              ","                                 ","    823,304","     52,476","     50,874","     38,922","     11,952","          0","          0","          0"
,

------------------------------------------------------------------------------------------------------------------------------------
1. Required Data from log files

"CLNRBUS                ","7 - 01/31/2","007",
"Level 1                  ","SName         **Total**","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"8424700                  ","RDJMdicl                   ","          1","        158","        169","        109","         60","          0","          0","          0"
"8425106                  ","             8          9        ","  7        ","  0        ","  7        ","  0        ","  0        ","  0",
"8425822                  ","Sh Gol, D                  ","          4","         75","         87","         79","          8","          0","          0","          0"
etc...

------------------------------------------------------------------
2. How the Data needs to be prepared for import

"CLNRBUS","01/31/2007","8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"CLNRBUS","01/31/2007","8425106","89","7","0","7","0","0","0","",""
"CLNRBUS","01/31/2007","8425822","Sh Gol, D","4","75","87","79","8","0","0","0"
"CLNRBUS","01/31/2007","8426300","L T Nish M","0","38","41","38","3","0","0","0"
"CLNRBUS","01/31/2007","8426400","McC and Ates","0","49","46","18","28","0","0","0"
"CLNRBUS","01/31/2007","8426278","Thi Str Fam H S","413","24","24","13","11","0","0","0"
"WMDUTIL","01/31/2007","7700100","Prac sen tra done N","2","0","0","0","0","0","0","0"
"PRFASP","01/31/2007","0718200","Broooo Phaaaaaa 400","2","0","0","0","0","0","0","0"
"PRFASP","01/31/2007","0710028","Stoo And Shoo Phrrrrrr 155","1","0","0","0","0","0","0","0"
Comment
Watch Question

Do more with

Expert Office
EXPERT OFFICE® is a registered trademark of EXPERTS EXCHANGE®
You were given quite the data to work with.  In your output column4 you collapsed the spaces in some rows ("89") and not others ("Sh Gol"), what are the rules that govern the formatting of the data for the output file?
High School Computer Science, Computer Applications, Digital Design, and Mathematics Teacher
Top Expert 2009
Commented:
If you have VB.Net 2005, then use the TextFieldParser() class.

See how this little piece of code handles the data:

        Dim tfp As New Microsoft.VisualBasic.FileIO.TextFieldParser("c:\someFile.txt")
        tfp.TextFieldType = FileIO.FieldType.Delimited
        tfp.SetDelimiters(",")
        tfp.HasFieldsEnclosedInQuotes = True
        tfp.TrimWhiteSpace = True

        Dim fields() As String
        While Not tfp.EndOfData
            fields = tfp.ReadFields()
            Debug.Print(Chr(34) & String.Join(Chr(34) & "," & Chr(34), fields) & Chr(34))
        End While
        tfp.Close()

It might be a good place to start.  You can rearrange/reformat the data in the "fields" array before you call String.Join() to put it back together.

The TextFieldParser() is a good choice because it can automatically handle fields with embedded delimeters such as this sample in your data:

    "8425822                  ","Sh Gol, D                  ", ...

The "Sh Gol, D" makes it trickier to parse manually with a simple String.Split() call.

Author

Commented:
I was given a mess to work with.
Angular Fundamentals

Learn the fundamentals of Angular 2, a JavaScript framework for developing dynamic single page applications.

Author

Commented:
Idle_Mind, I'm using vb.net 2005. I didn't know about that class and yes this text file is tricky.
I'll admit I wasn't aware of the class since I don't use VB.NET 2005, but fwsteal please give a little more info about how you want to arrive at your output file format.

In column4 you collapsed the spaces in some rows ("89") and not others ("Sh Gol"), what are the rules that govern the formatting of the data for the output file, in other words how should one know to collapse spaces between some characters and not others (collapse spaces between all numbers etc.)?
Most Valuable Expert 2012
Top Expert 2008

Commented:
You can use that class in C# also, you just need the right reference.

Bob

Author

Commented:
Right, this is what 89 looked like:
"             8          9        "
I should have read: "8          9"
Output rules? There aren't any other than getting the data formatted and load into sql server. I guess that I'd want to trim the empty characters between the double quotes so it reads like the "8 9" above.

I'll create a console app to test the formatting.

Author

Commented:
I forgot to mention that this is a csv file. I use Visual Slick Edit to view the file.

Author

Commented:
Here is the code I used and unfortunately, the file is still the same:

Imports System
Imports System.Data
Imports System.Diagnostics
Imports System.Collections.Generic
Imports System.Globalization
Imports System.IO
Imports System.Text
Imports System.Text.RegularExpressions
Imports Microsoft.VisualBasic.FileIO

Module FileFormat
    Dim sSource As String = "vbnet_formatfile"
    Dim sLog As String = "Application"
    Dim sMachine As String = "."
    Dim ELog As New EventLog(sLog, sMachine, sSource)

    Sub Main()
        Dim currentFile As String = "C:\log\Prx-Import-2007-01.csv"

        sFormatFile(currentFile)
    End Sub

    Sub sFormatFile(ByVal currentFile As String)
        Dim tfp As New Microsoft.VisualBasic.FileIO.TextFieldParser(currentFile)

        tfp.TextFieldType = FileIO.FieldType.Delimited
        tfp.SetDelimiters(",")
        tfp.HasFieldsEnclosedInQuotes = True
        tfp.TrimWhiteSpace = True

        Dim fields() As String
        Try
            While Not tfp.EndOfData
                fields = tfp.ReadFields()
                Debug.Print(Chr(34) & String.Join(Chr(34) & "," & Chr(34), fields) & Chr(34))
            End While
            tfp.Close()
        Catch ex As Exception
            ELog.WriteEntry(ex.ToString(), EventLogEntryType.Error, 0, CType(4, Short))
        Finally
            GC.Collect()
        End Try
    End Sub

End Module
Mike TomlinsonHigh School Computer Science, Computer Applications, Digital Design, and Mathematics Teacher
Top Expert 2009

Commented:
Well...

I pasted the data you posted above into NotePad and saved it as a text file.  The code I proposed cleaned it up quite a bit on my system.   =\

Can you try the same thing please?

This will tell us if there are other control characters in the original file that can't be seen by us.  If this is the case, you can post a smaller sample file at ee-stuff so we can work out a better solution:
http://www.ee-stuff.com/

Author

Commented:
Bob,
I did as you suggested and ran the module to format the file. I then compared the two files and they are the same.

I uploaded the file:

Your file has successfully been uploaded!
To download the file, you must be logged into EE-Stuff. Here are two pages that will display your file, if logged in:

View all files for Question ID: 22603067
https://filedb.experts-exchange.com/incoming/ee-stuff/3588-Prx-Import-2007-01.txt 

Direct link to your file
https://filedb.experts-exchange.com/incoming/ee-stuff/3588-Prx-Import-2007-01.txt 
Mike TomlinsonHigh School Computer Science, Computer Applications, Digital Design, and Mathematics Teacher
Top Expert 2009

Commented:
Unfortunately I don't work with DB side stuff so I have no clue on your other question.  Testing with the file you uploaded...

By the way, my name is Mike.  =)

Bob, is TheLearnedOne: http://www.experts-exchange.com/M_38421.html
(he's a great coder...better than me!...don't worry, though, people swap us all the time!)
Mike TomlinsonHigh School Computer Science, Computer Applications, Digital Design, and Mathematics Teacher
Top Expert 2009

Commented:
I ran my original code and this is what appeared in my "Immediate" window in my IDE:

"PPI001-MSB","","WbM P","PI","","","","P","age 1",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"CLNRBUS","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName         **Total**","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"8425106","8          9","7","0","7","0","0","0",""
"8425822","Sh Gol, D","4","75","87","79","8","0","0","0"
"",""
"PPI001-MSB","","WbM P","PI","","","","P","age 7",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"CLNRBUS","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"8426300","L T Nish M","0","38","41","38","3","0","0","0"
"8426400","McC and Ates","0","49","46","18","28","0","0","0"
"8426278","Thi Str Fam H S","413","24","24","13","11","0","0","0"
"CLNRBUS **Total**","Subscriber Count              117","9,548","4,785","4,519","3,599","920","0","0","0"
"",""
"PPI001-MSB","","WbM P","PI","","","","P","age 956",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"WMDUTIL","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"7700100","Prac sen tra done N","2","0","0","0","0","0","0","0"
"WMDUTIL **Total**","Subscriber Count                1","2","0","0","0","0","0","0","0"
"",""
"PPI001-MSB","","WbM P","PI","","","","P","age 742",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"PRFASP","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"0718200","Broooo Phaaaaaa 400","2","0","0","0","0","0","0","0"
"0710028","Stoo And Shoo Phrrrrrr 155","1","0","0","0","0","0","0","0"
"",""
"PPI001-MSB","","WbM P","PI","","","","P","age 957",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"GRAND TOTAL","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"Grand Total","","823,304","52,476","50,874","38,922","11,952","0","0","0"
"",""

It seems to be cleaning it up nicely on my system.  My code does NOT modify the original file at all.  I assumed you would want to process it more before writing it back out.  Do you need to see how to write the dataa back out?

Author

Commented:
Bob,
Yes, I need to process the file quite a bit to get to look like:

"CLNRBUS","01/31/2007","8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"CLNRBUS","01/31/2007","8425106","8          9","7","0","7","0","0","0","",""
"CLNRBUS","01/31/2007","8425822","Sh Gol, D","4","75","87","79","8","0","0","0"
"CLNRBUS","01/31/2007","8426300","L T Nish M","0","38","41","38","3","0","0","0"
"CLNRBUS","01/31/2007","8426400","McC and Ates","0","49","46","18","28","0","0","0"
"CLNRBUS","01/31/2007","8426278","Thi Str Fam H S","413","24","24","13","11","0","0","0"
"WMDUTIL","01/31/2007","7700100","Prac sen tra done N","2","0","0","0","0","0","0","0"
"PRFASP","01/31/2007","0718200","Broooo Phaaaaaa 400","2","0","0","0","0","0","0","0"
"PRFASP","01/31/2007","0710028","Stoo And Shoo Phrrrrrr 155","1","0","0","0","0","0","0","0"

Should double quotes and commas be used? I thought about using a single quote and a | to help separate the data for the import.

Author

Commented:
Bob, Okay, I got the same results in the output window is vs. It definitely removed some of the extra white space.

Author

Commented:
This helped me see the temp changes: Console.WriteLine(Chr(34) & String.Join(Chr(34) & "," & Chr(34), fields) & Chr(34))

Author

Commented:
I did this to write the changes to the same file:

    Sub Main()
        Dim currentFile As String = "C:\log\file.txt"
        Dim tempFile As String = "C:\log\file_temp.txt"

        sFormatFile(currentFile, tempFile)
    End Sub

    Sub sFormatFile(ByVal currentFile As String, ByVal tempFile As String)
        Dim tfp As New Microsoft.VisualBasic.FileIO.TextFieldParser(currentFile)
        Dim sw As New System.IO.StreamWriter(tempFile)

        tfp.TextFieldType = FileIO.FieldType.Delimited
        tfp.SetDelimiters(",")
        tfp.HasFieldsEnclosedInQuotes = True
        tfp.TrimWhiteSpace = True

        Dim fields() As String
        Try
            While Not tfp.EndOfData
                fields = tfp.ReadFields()
                sw.WriteLine(Chr(34) & String.Join(Chr(34) & "," & Chr(34), fields) & Chr(34))
            End While
            tfp.Close()
            sw.Close()
            File.Delete(currentFile)
            File.Move(tempFile, currentFile)
        Catch ex As Exception
            ELog.WriteEntry(ex.ToString(), EventLogEntryType.Error, 0, CType(4, Short))
        Finally
            GC.Collect()
        End Try
    End Sub

Author

Commented:
Here is how I was thinking of mapping the data:
GN            "CLNRBUS",
TDate            "01/31/2007",
Level1            "8424700",
SName            "RDJMdicl",
NR            "1",
ReFreq            "158",
ReFres            "169",
ReFresA            "109",
ReFresD            "60",
RChg            "0",
ChgRes            "0",
RFill            "0"

Author

Commented:
this is how I thought the process would work, of course I'm not sure how to do it.

1. delete top two lines
    "PPI001-MSB","","WbM P","PI","","","","P","age 1",""
    "","SCRIPT Me","ssage Tally","Summary Bi","llable",""

2. capture the GN and TDate from the 3rd line
    "CLNRBUS","Period",": 01/01/200","7 - 01/31/2","007",""

3. delete the 4th line
    "Level 1","SName         **Total**","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"

4. parse the following data lines:
"8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"8425106","8          9","7","0","7","0","0","0",""
"8425822","Sh Gol, D","4","75","87","79","8","0","0","0"

so they look like:

"CLNRBUS","01/31/2007","8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"CLNRBUS","01/31/2007","8425106","8          9","7","0","7","0","0","0","",""
"CLNRBUS","01/31/2007","8425822","Sh Gol, D","4","75","87","79","8","0","0","0"

5. delete the separator line: "",""

6. Repeat the process until the end of the file is reached.
Most Valuable Expert 2012
Top Expert 2008

Commented:
One little off-the-cuff comment, and I am getting all the credit--too bad Mike ;)

Bob

Author

Commented:
I'd think so.

Do more with

Expert Office
Submit tech questions to Ask the Experts™ at any time to receive solutions, advice, and new ideas from leading industry professionals.

Start 7-Day Free Trial