Link to home
Start Free TrialLog in
Avatar of fwsteal
fwsteal

asked on

parsing a complex text file

I need help parsing a complex text file.

example data:

"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 1",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"CLNRBUS                ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName         **Total**","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"8424700                  ","RDJMdicl                   ","          1","        158","        169","        109","         60","          0","          0","          0"
"8425106                  ","             8          9        ","  7        ","  0        ","  7        ","  0        ","  0        ","  0",
"8425822                  ","Sh Gol, D                  ","          4","         75","         87","         79","          8","          0","          0","          0"
,
"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 7",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"CLNRBUS                ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName                  ","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"8426300                  ","L T Nish M               ","          0","         38","         41","         38","          3","          0","          0","          0"
"8426400                  ","McC and Ates          ","          0","         49","         46","         18","         28","          0","          0","          0"
"8426278                  ","Thi Str Fam H S   ","        413","         24","         24","         13","         11","          0","          0","          0"
"CLNRBUS **Total**      ","Subscriber Count              117","      9,548","      4,785","      4,519","      3,599","        920","          0","          0","          0"
,
"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 956",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"WMDUTIL                ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName                  ","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"7700100                  ","Prac sen tra done N   ","          2","          0","          0","          0","          0","          0","          0","          0"
"WMDUTIL **Total**      ","Subscriber Count                1","          2","          0","          0","          0","          0","          0","          0","          0"
,
"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 742",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"PRFASP                ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName                  ","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"0718200                  ","Broooo Phaaaaaa 400              ","          2","          0","          0","          0","          0","          0","          0","          0"
"0710028                  ","Stoo And Shoo Phrrrrrr 155       ","          1","          0","          0","          0","          0","          0","          0","          0"
,
"PPI001-MSB               ","                                 ","    WbM P","PI         ","           ","           ","           ","          P","age 957",
"                         ","                        SCRIPT Me","ssage Tally"," Summary Bi","llable",
"GRAND TOTAL              ","                           Period",": 01/01/200","7 - 01/31/2","007",
"Level 1                  ","SName                  ","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"Grand Total              ","                                 ","    823,304","     52,476","     50,874","     38,922","     11,952","          0","          0","          0"
,

------------------------------------------------------------------------------------------------------------------------------------
1. Required Data from log files

"CLNRBUS                ","7 - 01/31/2","007",
"Level 1                  ","SName         **Total**","      NR","     REFREQ","     REFRES","    REFRESA","    REFRESD","      RCHG","     CHGRES","     RFILL"
"8424700                  ","RDJMdicl                   ","          1","        158","        169","        109","         60","          0","          0","          0"
"8425106                  ","             8          9        ","  7        ","  0        ","  7        ","  0        ","  0        ","  0",
"8425822                  ","Sh Gol, D                  ","          4","         75","         87","         79","          8","          0","          0","          0"
etc...

------------------------------------------------------------------
2. How the Data needs to be prepared for import

"CLNRBUS","01/31/2007","8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"CLNRBUS","01/31/2007","8425106","89","7","0","7","0","0","0","",""
"CLNRBUS","01/31/2007","8425822","Sh Gol, D","4","75","87","79","8","0","0","0"
"CLNRBUS","01/31/2007","8426300","L T Nish M","0","38","41","38","3","0","0","0"
"CLNRBUS","01/31/2007","8426400","McC and Ates","0","49","46","18","28","0","0","0"
"CLNRBUS","01/31/2007","8426278","Thi Str Fam H S","413","24","24","13","11","0","0","0"
"WMDUTIL","01/31/2007","7700100","Prac sen tra done N","2","0","0","0","0","0","0","0"
"PRFASP","01/31/2007","0718200","Broooo Phaaaaaa 400","2","0","0","0","0","0","0","0"
"PRFASP","01/31/2007","0710028","Stoo And Shoo Phrrrrrr 155","1","0","0","0","0","0","0","0"
Avatar of ddrudik
ddrudik
Flag of United States of America image

You were given quite the data to work with.  In your output column4 you collapsed the spaces in some rows ("89") and not others ("Sh Gol"), what are the rules that govern the formatting of the data for the output file?
ASKER CERTIFIED SOLUTION
Avatar of Mike Tomlinson
Mike Tomlinson
Flag of United States of America image

Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial
Avatar of fwsteal
fwsteal

ASKER

I was given a mess to work with.
Avatar of fwsteal

ASKER

Idle_Mind, I'm using vb.net 2005. I didn't know about that class and yes this text file is tricky.
I'll admit I wasn't aware of the class since I don't use VB.NET 2005, but fwsteal please give a little more info about how you want to arrive at your output file format.

In column4 you collapsed the spaces in some rows ("89") and not others ("Sh Gol"), what are the rules that govern the formatting of the data for the output file, in other words how should one know to collapse spaces between some characters and not others (collapse spaces between all numbers etc.)?
You can use that class in C# also, you just need the right reference.

Bob
Avatar of fwsteal

ASKER

Right, this is what 89 looked like:
"             8          9        "
I should have read: "8          9"
Output rules? There aren't any other than getting the data formatted and load into sql server. I guess that I'd want to trim the empty characters between the double quotes so it reads like the "8 9" above.

I'll create a console app to test the formatting.
Avatar of fwsteal

ASKER

I forgot to mention that this is a csv file. I use Visual Slick Edit to view the file.
Avatar of fwsteal

ASKER

Here is the code I used and unfortunately, the file is still the same:

Imports System
Imports System.Data
Imports System.Diagnostics
Imports System.Collections.Generic
Imports System.Globalization
Imports System.IO
Imports System.Text
Imports System.Text.RegularExpressions
Imports Microsoft.VisualBasic.FileIO

Module FileFormat
    Dim sSource As String = "vbnet_formatfile"
    Dim sLog As String = "Application"
    Dim sMachine As String = "."
    Dim ELog As New EventLog(sLog, sMachine, sSource)

    Sub Main()
        Dim currentFile As String = "C:\log\Prx-Import-2007-01.csv"

        sFormatFile(currentFile)
    End Sub

    Sub sFormatFile(ByVal currentFile As String)
        Dim tfp As New Microsoft.VisualBasic.FileIO.TextFieldParser(currentFile)

        tfp.TextFieldType = FileIO.FieldType.Delimited
        tfp.SetDelimiters(",")
        tfp.HasFieldsEnclosedInQuotes = True
        tfp.TrimWhiteSpace = True

        Dim fields() As String
        Try
            While Not tfp.EndOfData
                fields = tfp.ReadFields()
                Debug.Print(Chr(34) & String.Join(Chr(34) & "," & Chr(34), fields) & Chr(34))
            End While
            tfp.Close()
        Catch ex As Exception
            ELog.WriteEntry(ex.ToString(), EventLogEntryType.Error, 0, CType(4, Short))
        Finally
            GC.Collect()
        End Try
    End Sub

End Module
Well...

I pasted the data you posted above into NotePad and saved it as a text file.  The code I proposed cleaned it up quite a bit on my system.   =\

Can you try the same thing please?

This will tell us if there are other control characters in the original file that can't be seen by us.  If this is the case, you can post a smaller sample file at ee-stuff so we can work out a better solution:
http://www.ee-stuff.com/
Avatar of fwsteal

ASKER

Bob,
I did as you suggested and ran the module to format the file. I then compared the two files and they are the same.

I uploaded the file:

Your file has successfully been uploaded!
To download the file, you must be logged into EE-Stuff. Here are two pages that will display your file, if logged in:

View all files for Question ID: 22603067
https://filedb.experts-exchange.com/incoming/ee-stuff/3588-Prx-Import-2007-01.txt 

Direct link to your file
https://filedb.experts-exchange.com/incoming/ee-stuff/3588-Prx-Import-2007-01.txt 
Unfortunately I don't work with DB side stuff so I have no clue on your other question.  Testing with the file you uploaded...

By the way, my name is Mike.  =)

Bob, is TheLearnedOne: https://www.experts-exchange.com/M_38421.html
(he's a great coder...better than me!...don't worry, though, people swap us all the time!)
I ran my original code and this is what appeared in my "Immediate" window in my IDE:

"PPI001-MSB","","WbM P","PI","","","","P","age 1",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"CLNRBUS","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName         **Total**","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"8425106","8          9","7","0","7","0","0","0",""
"8425822","Sh Gol, D","4","75","87","79","8","0","0","0"
"",""
"PPI001-MSB","","WbM P","PI","","","","P","age 7",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"CLNRBUS","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"8426300","L T Nish M","0","38","41","38","3","0","0","0"
"8426400","McC and Ates","0","49","46","18","28","0","0","0"
"8426278","Thi Str Fam H S","413","24","24","13","11","0","0","0"
"CLNRBUS **Total**","Subscriber Count              117","9,548","4,785","4,519","3,599","920","0","0","0"
"",""
"PPI001-MSB","","WbM P","PI","","","","P","age 956",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"WMDUTIL","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"7700100","Prac sen tra done N","2","0","0","0","0","0","0","0"
"WMDUTIL **Total**","Subscriber Count                1","2","0","0","0","0","0","0","0"
"",""
"PPI001-MSB","","WbM P","PI","","","","P","age 742",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"PRFASP","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"0718200","Broooo Phaaaaaa 400","2","0","0","0","0","0","0","0"
"0710028","Stoo And Shoo Phrrrrrr 155","1","0","0","0","0","0","0","0"
"",""
"PPI001-MSB","","WbM P","PI","","","","P","age 957",""
"","SCRIPT Me","ssage Tally","Summary Bi","llable",""
"GRAND TOTAL","Period",": 01/01/200","7 - 01/31/2","007",""
"Level 1","SName","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"
"Grand Total","","823,304","52,476","50,874","38,922","11,952","0","0","0"
"",""

It seems to be cleaning it up nicely on my system.  My code does NOT modify the original file at all.  I assumed you would want to process it more before writing it back out.  Do you need to see how to write the dataa back out?

Avatar of fwsteal

ASKER

Bob,
Yes, I need to process the file quite a bit to get to look like:

"CLNRBUS","01/31/2007","8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"CLNRBUS","01/31/2007","8425106","8          9","7","0","7","0","0","0","",""
"CLNRBUS","01/31/2007","8425822","Sh Gol, D","4","75","87","79","8","0","0","0"
"CLNRBUS","01/31/2007","8426300","L T Nish M","0","38","41","38","3","0","0","0"
"CLNRBUS","01/31/2007","8426400","McC and Ates","0","49","46","18","28","0","0","0"
"CLNRBUS","01/31/2007","8426278","Thi Str Fam H S","413","24","24","13","11","0","0","0"
"WMDUTIL","01/31/2007","7700100","Prac sen tra done N","2","0","0","0","0","0","0","0"
"PRFASP","01/31/2007","0718200","Broooo Phaaaaaa 400","2","0","0","0","0","0","0","0"
"PRFASP","01/31/2007","0710028","Stoo And Shoo Phrrrrrr 155","1","0","0","0","0","0","0","0"

Should double quotes and commas be used? I thought about using a single quote and a | to help separate the data for the import.

Avatar of fwsteal

ASKER

Bob, Okay, I got the same results in the output window is vs. It definitely removed some of the extra white space.
Avatar of fwsteal

ASKER

This helped me see the temp changes: Console.WriteLine(Chr(34) & String.Join(Chr(34) & "," & Chr(34), fields) & Chr(34))
Avatar of fwsteal

ASKER

I did this to write the changes to the same file:

    Sub Main()
        Dim currentFile As String = "C:\log\file.txt"
        Dim tempFile As String = "C:\log\file_temp.txt"

        sFormatFile(currentFile, tempFile)
    End Sub

    Sub sFormatFile(ByVal currentFile As String, ByVal tempFile As String)
        Dim tfp As New Microsoft.VisualBasic.FileIO.TextFieldParser(currentFile)
        Dim sw As New System.IO.StreamWriter(tempFile)

        tfp.TextFieldType = FileIO.FieldType.Delimited
        tfp.SetDelimiters(",")
        tfp.HasFieldsEnclosedInQuotes = True
        tfp.TrimWhiteSpace = True

        Dim fields() As String
        Try
            While Not tfp.EndOfData
                fields = tfp.ReadFields()
                sw.WriteLine(Chr(34) & String.Join(Chr(34) & "," & Chr(34), fields) & Chr(34))
            End While
            tfp.Close()
            sw.Close()
            File.Delete(currentFile)
            File.Move(tempFile, currentFile)
        Catch ex As Exception
            ELog.WriteEntry(ex.ToString(), EventLogEntryType.Error, 0, CType(4, Short))
        Finally
            GC.Collect()
        End Try
    End Sub
Avatar of fwsteal

ASKER

Here is how I was thinking of mapping the data:
GN            "CLNRBUS",
TDate            "01/31/2007",
Level1            "8424700",
SName            "RDJMdicl",
NR            "1",
ReFreq            "158",
ReFres            "169",
ReFresA            "109",
ReFresD            "60",
RChg            "0",
ChgRes            "0",
RFill            "0"
Avatar of fwsteal

ASKER

this is how I thought the process would work, of course I'm not sure how to do it.

1. delete top two lines
    "PPI001-MSB","","WbM P","PI","","","","P","age 1",""
    "","SCRIPT Me","ssage Tally","Summary Bi","llable",""

2. capture the GN and TDate from the 3rd line
    "CLNRBUS","Period",": 01/01/200","7 - 01/31/2","007",""

3. delete the 4th line
    "Level 1","SName         **Total**","NR","REFREQ","REFRES","REFRESA","REFRESD","RCHG","CHGRES","RFILL"

4. parse the following data lines:
"8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"8425106","8          9","7","0","7","0","0","0",""
"8425822","Sh Gol, D","4","75","87","79","8","0","0","0"

so they look like:

"CLNRBUS","01/31/2007","8424700","RDJMdicl","1","158","169","109","60","0","0","0"
"CLNRBUS","01/31/2007","8425106","8          9","7","0","7","0","0","0","",""
"CLNRBUS","01/31/2007","8425822","Sh Gol, D","4","75","87","79","8","0","0","0"

5. delete the separator line: "",""

6. Repeat the process until the end of the file is reached.
One little off-the-cuff comment, and I am getting all the credit--too bad Mike ;)

Bob
Avatar of fwsteal

ASKER

I'd think so.