Z_Beeblebrox
asked on
List Parsing
Hi,
I have this program that produces text files which contain thousands of numbers, separated by new lines. The numbers are between 4 and 7 digits long. Some of these files are upwards of 3 MB. Unfortunately, most of the numbers are duplicated. What would be the most efficient way for me to parse these files and remove all of the duplicates?
Zaphod.
I have this program that produces text files which contain thousands of numbers, separated by new lines. The numbers are between 4 and 7 digits long. Some of these files are upwards of 3 MB. Unfortunately, most of the numbers are duplicated. What would be the most efficient way for me to parse these files and remove all of the duplicates?
Zaphod.
Well...
One way that I have used in the past is to read the list into an array (or a database if the number of unique values is truly huge).
Then scan the array (db) for each new number you read, and add or discard it.
Using an SQL compliant db has a couple of other advantages - in that you can dump the list sorted/filtered by any number of criteria, whereas you have to perform the operations yourself on an array - but arrays/RAM is much faster if the sample set can fit into memory.
One way that I have used in the past is to read the list into an array (or a database if the number of unique values is truly huge).
Then scan the array (db) for each new number you read, and add or discard it.
Using an SQL compliant db has a couple of other advantages - in that you can dump the list sorted/filtered by any number of criteria, whereas you have to perform the operations yourself on an array - but arrays/RAM is much faster if the sample set can fit into memory.
VERSION 5.00
Object = "{831FDD16-0C5C-11D2-A9FC-0000F8754DA1}#2.0#0" ; "mscomctl.ocx"
Begin VB.Form frmSortRand
   Caption         =   "Form1"
   ClientHeight    =   5190
   ClientLeft      =   60
   ClientTop       =   345
   ClientWidth     =   5895
   LinkTopic       =   "Form1"
   ScaleHeight     =   5190
   ScaleWidth      =   5895
   StartUpPosition =   3 'Windows Default
   Begin MSComctlLib.ListView lstOrder
      Height          =   4965
      Left            =   2175
      TabIndex        =   3
      Top             =   150
      Width           =   1740
      _ExtentX        =   3069
      _ExtentY        =   8758
      View            =   3
      LabelWrap       =   -1 'True
      HideSelection   =   -1 'True
      FullRowSelect   =   -1 'True
      _Version        =   393217
      ForeColor       =   -2147483640
      BackColor       =   -2147483643
      BorderStyle     =   1
      Appearance      =   1
      NumItems        =   0
   End
   Begin MSComctlLib.ListView lstRandom
      Height          =   4965
      Left            =   300
      TabIndex        =   2
      Top             =   150
      Width           =   1665
      _ExtentX        =   2937
      _ExtentY        =   8758
      View            =   3
      LabelWrap       =   -1 'True
      HideSelection   =   -1 'True
      FullRowSelect   =   -1 'True
      _Version        =   393217
      ForeColor       =   -2147483640
      BackColor       =   -2147483643
      BorderStyle     =   1
      Appearance      =   1
      NumItems        =   0
   End
   Begin VB.CommandButton cmdList
      Caption         =   "List"
      Height          =   390
      Left            =   4050
      TabIndex        =   1
      Top             =   900
      Width           =   1665
   End
   Begin VB.CommandButton cmdGetRandom
      Caption         =   "Get Random"
      Height          =   390
      Left            =   4050
      TabIndex        =   0
      Top             =   225
      Width           =   1665
   End
End
Attribute VB_Name = "frmSortRand"
Attribute VB_GlobalNameSpace = False
Attribute VB_Creatable = False
Attribute VB_PredeclaredId = True
Attribute VB_Exposed = False
Option Explicit

' MAX_RAND is used three ways by the procedures of this form: the number of
' random items generated, the largest random value generated, and the upper
' bound of the "seen" array used to de-duplicate them.
Const MAX_RAND = 1000
' Fills lstRandom with MAX_RAND random whole numbers in the range 1..MAX_RAND.
' Each item is keyed "id=<position>"; repeated values are expected.
Private Sub cmdGetRandom_Click()
    Dim nIndex As Integer

    ' Seed the generator from the system timer. Without a Randomize call,
    ' Rnd produces the same sequence on every run of the program.
    Randomize

    lstRandom.ListItems.Clear
    For nIndex = 1 To MAX_RAND
        ' Int((MAX_RAND * Rnd) + 1) maps Rnd's [0, 1) onto 1..MAX_RAND.
        lstRandom.ListItems.Add , "id=" & nIndex, CStr(Int((MAX_RAND * Rnd) + 1))
    Next nIndex
End Sub
' Shows the distinct values of lstRandom in lstOrder, in ascending order.
' Each value is used directly as an index into a flag array, so duplicates
' collapse onto the same slot and walking the array yields sorted output.
' Every item text must convert to an integer within the bounds of aSorted
' (0..MAX_RAND under the default Option Base); anything else raises an error.
Private Sub cmdList_Click()
    Dim aSorted(MAX_RAND) As Integer    ' 1 = value has been seen, 0 = not seen
    Dim nIndex As Integer

    ' Pass 1: flag every value that occurs in the source list.
    For nIndex = 1 To lstRandom.ListItems.Count
        aSorted(CInt(lstRandom.ListItems.Item(nIndex).Text)) = 1
    Next nIndex

    ' Pass 2: emit the flagged values; index order is numeric order.
    lstOrder.ListItems.Clear
    For nIndex = 0 To MAX_RAND
        If aSorted(nIndex) = 1 Then
            lstOrder.ListItems.Add , "id=" & nIndex, CStr(nIndex)
        End If
    Next nIndex
End Sub
' Adds one column header to each ListView so that items are visible in the
' column-based view set at design time (View = 3).
' The column is 265 units narrower than the control
' (NOTE(review): presumably to leave room for a vertical scroll bar - confirm).
Private Sub Form_Load()
    lstRandom.ColumnHeaders.Add , , , lstRandom.Width - 265
    lstOrder.ColumnHeaders.Add , , , lstOrder.Width - 265
End Sub
VERSION 5.00
Object = "{831FDD16-0C5C-11D2-A9FC-0000F8754DA1}#2.0#0" ; "mscomctl.ocx"
Begin VB.Form frmSortRand
   Caption         =   "Form1"
   ClientHeight    =   5190
   ClientLeft      =   60
   ClientTop       =   345
   ClientWidth     =   5895
   LinkTopic       =   "Form1"
   ScaleHeight     =   5190
   ScaleWidth      =   5895
   StartUpPosition =   3 'Windows Default
   Begin MSComctlLib.ListView lstOrder
      Height          =   4965
      Left            =   2175
      TabIndex        =   3
      Top             =   150
      Width           =   1740
      _ExtentX        =   3069
      _ExtentY        =   8758
      View            =   3
      LabelWrap       =   -1 'True
      HideSelection   =   -1 'True
      FullRowSelect   =   -1 'True
      _Version        =   393217
      ForeColor       =   -2147483640
      BackColor       =   -2147483643
      BorderStyle     =   1
      Appearance      =   1
      NumItems        =   0
   End
   Begin MSComctlLib.ListView lstRandom
      Height          =   4965
      Left            =   300
      TabIndex        =   2
      Top             =   150
      Width           =   1665
      _ExtentX        =   2937
      _ExtentY        =   8758
      View            =   3
      LabelWrap       =   -1 'True
      HideSelection   =   -1 'True
      FullRowSelect   =   -1 'True
      _Version        =   393217
      ForeColor       =   -2147483640
      BackColor       =   -2147483643
      BorderStyle     =   1
      Appearance      =   1
      NumItems        =   0
   End
   Begin VB.CommandButton cmdList
      Caption         =   "List"
      Height          =   390
      Left            =   4050
      TabIndex        =   1
      Top             =   900
      Width           =   1665
   End
   Begin VB.CommandButton cmdGetRandom
      Caption         =   "Get Random"
      Height          =   390
      Left            =   4050
      TabIndex        =   0
      Top             =   225
      Width           =   1665
   End
End
Attribute VB_Name = "frmSortRand"
Attribute VB_GlobalNameSpace = False
Attribute VB_Creatable = False
Attribute VB_PredeclaredId = True
Attribute VB_Exposed = False
Option Explicit

' MAX_RAND is used three ways by the procedures of this form: the number of
' random items generated, the largest random value generated, and the upper
' bound of the "seen" array used to de-duplicate them.
Const MAX_RAND = 1000
' Fills lstRandom with MAX_RAND random whole numbers in the range 1..MAX_RAND.
' Each item is keyed "id=<position>"; repeated values are expected.
Private Sub cmdGetRandom_Click()
    Dim nIndex As Integer

    ' Seed the generator from the system timer. Without a Randomize call,
    ' Rnd produces the same sequence on every run of the program.
    Randomize

    lstRandom.ListItems.Clear
    For nIndex = 1 To MAX_RAND
        ' Int((MAX_RAND * Rnd) + 1) maps Rnd's [0, 1) onto 1..MAX_RAND.
        lstRandom.ListItems.Add , "id=" & nIndex, CStr(Int((MAX_RAND * Rnd) + 1))
    Next nIndex
End Sub
' Shows the distinct values of lstRandom in lstOrder, in ascending order.
' Each value is used directly as an index into a flag array, so duplicates
' collapse onto the same slot and walking the array yields sorted output.
' Every item text must convert to an integer within the bounds of aSorted
' (0..MAX_RAND under the default Option Base); anything else raises an error.
Private Sub cmdList_Click()
    Dim aSorted(MAX_RAND) As Integer    ' 1 = value has been seen, 0 = not seen
    Dim nIndex As Integer

    ' Pass 1: flag every value that occurs in the source list.
    For nIndex = 1 To lstRandom.ListItems.Count
        aSorted(CInt(lstRandom.ListItems.Item(nIndex).Text)) = 1
    Next nIndex

    ' Pass 2: emit the flagged values; index order is numeric order.
    lstOrder.ListItems.Clear
    For nIndex = 0 To MAX_RAND
        If aSorted(nIndex) = 1 Then
            lstOrder.ListItems.Add , "id=" & nIndex, CStr(nIndex)
        End If
    Next nIndex
End Sub
' Adds one column header to each ListView so that items are visible in the
' column-based view set at design time (View = 3).
' The column is 265 units narrower than the control
' (NOTE(review): presumably to leave room for a vertical scroll bar - confirm).
Private Sub Form_Load()
    lstRandom.ColumnHeaders.Add , , , lstRandom.Width - 265
    lstOrder.ColumnHeaders.Add , , , lstOrder.Width - 265
End Sub
VERSION 5.00
Object = "{831FDD16-0C5C-11D2-A9FC-0000F8754DA1}#2.0#0" ; "mscomctl.ocx"
Begin VB.Form frmSortRand
   Caption         =   "Form1"
   ClientHeight    =   5190
   ClientLeft      =   60
   ClientTop       =   345
   ClientWidth     =   5895
   LinkTopic       =   "Form1"
   ScaleHeight     =   5190
   ScaleWidth      =   5895
   StartUpPosition =   3 'Windows Default
   Begin MSComctlLib.ListView lstOrder
      Height          =   4965
      Left            =   2175
      TabIndex        =   3
      Top             =   150
      Width           =   1740
      _ExtentX        =   3069
      _ExtentY        =   8758
      View            =   3
      LabelWrap       =   -1 'True
      HideSelection   =   -1 'True
      FullRowSelect   =   -1 'True
      _Version        =   393217
      ForeColor       =   -2147483640
      BackColor       =   -2147483643
      BorderStyle     =   1
      Appearance      =   1
      NumItems        =   0
   End
   Begin MSComctlLib.ListView lstRandom
      Height          =   4965
      Left            =   300
      TabIndex        =   2
      Top             =   150
      Width           =   1665
      _ExtentX        =   2937
      _ExtentY        =   8758
      View            =   3
      LabelWrap       =   -1 'True
      HideSelection   =   -1 'True
      FullRowSelect   =   -1 'True
      _Version        =   393217
      ForeColor       =   -2147483640
      BackColor       =   -2147483643
      BorderStyle     =   1
      Appearance      =   1
      NumItems        =   0
   End
   Begin VB.CommandButton cmdList
      Caption         =   "List"
      Height          =   390
      Left            =   4050
      TabIndex        =   1
      Top             =   900
      Width           =   1665
   End
   Begin VB.CommandButton cmdGetRandom
      Caption         =   "Get Random"
      Height          =   390
      Left            =   4050
      TabIndex        =   0
      Top             =   225
      Width           =   1665
   End
End
Attribute VB_Name = "frmSortRand"
Attribute VB_GlobalNameSpace = False
Attribute VB_Creatable = False
Attribute VB_PredeclaredId = True
Attribute VB_Exposed = False
Option Explicit

' MAX_RAND is used three ways by the procedures of this form: the number of
' random items generated, the largest random value generated, and the upper
' bound of the "seen" array used to de-duplicate them.
Const MAX_RAND = 1000
' Fills lstRandom with MAX_RAND random whole numbers in the range 1..MAX_RAND.
' Each item is keyed "id=<position>"; repeated values are expected.
Private Sub cmdGetRandom_Click()
    Dim nIndex As Integer

    ' Seed the generator from the system timer. Without a Randomize call,
    ' Rnd produces the same sequence on every run of the program.
    Randomize

    lstRandom.ListItems.Clear
    For nIndex = 1 To MAX_RAND
        ' Int((MAX_RAND * Rnd) + 1) maps Rnd's [0, 1) onto 1..MAX_RAND.
        lstRandom.ListItems.Add , "id=" & nIndex, CStr(Int((MAX_RAND * Rnd) + 1))
    Next nIndex
End Sub
' Shows the distinct values of lstRandom in lstOrder, in ascending order.
' Each value is used directly as an index into a flag array, so duplicates
' collapse onto the same slot and walking the array yields sorted output.
' Every item text must convert to an integer within the bounds of aSorted
' (0..MAX_RAND under the default Option Base); anything else raises an error.
Private Sub cmdList_Click()
    Dim aSorted(MAX_RAND) As Integer    ' 1 = value has been seen, 0 = not seen
    Dim nIndex As Integer

    ' Pass 1: flag every value that occurs in the source list.
    For nIndex = 1 To lstRandom.ListItems.Count
        aSorted(CInt(lstRandom.ListItems.Item(nIndex).Text)) = 1
    Next nIndex

    ' Pass 2: emit the flagged values; index order is numeric order.
    lstOrder.ListItems.Clear
    For nIndex = 0 To MAX_RAND
        If aSorted(nIndex) = 1 Then
            lstOrder.ListItems.Add , "id=" & nIndex, CStr(nIndex)
        End If
    Next nIndex
End Sub
' Adds one column header to each ListView so that items are visible in the
' column-based view set at design time (View = 3).
' The column is 265 units narrower than the control
' (NOTE(review): presumably to leave room for a vertical scroll bar - confirm).
Private Sub Form_Load()
    lstRandom.ColumnHeaders.Add , , , lstRandom.Width - 265
    lstOrder.ColumnHeaders.Add , , , lstOrder.Width - 265
End Sub
VERSION 5.00
Object = "{831FDD16-0C5C-11D2-A9FC-0000F8754DA1}#2.0#0" ; "mscomctl.ocx"
Begin VB.Form frmSortRand
   Caption         =   "Form1"
   ClientHeight    =   5190
   ClientLeft      =   60
   ClientTop       =   345
   ClientWidth     =   5895
   LinkTopic       =   "Form1"
   ScaleHeight     =   5190
   ScaleWidth      =   5895
   StartUpPosition =   3 'Windows Default
   Begin MSComctlLib.ListView lstOrder
      Height          =   4965
      Left            =   2175
      TabIndex        =   3
      Top             =   150
      Width           =   1740
      _ExtentX        =   3069
      _ExtentY        =   8758
      View            =   3
      LabelWrap       =   -1 'True
      HideSelection   =   -1 'True
      FullRowSelect   =   -1 'True
      _Version        =   393217
      ForeColor       =   -2147483640
      BackColor       =   -2147483643
      BorderStyle     =   1
      Appearance      =   1
      NumItems        =   0
   End
   Begin MSComctlLib.ListView lstRandom
      Height          =   4965
      Left            =   300
      TabIndex        =   2
      Top             =   150
      Width           =   1665
      _ExtentX        =   2937
      _ExtentY        =   8758
      View            =   3
      LabelWrap       =   -1 'True
      HideSelection   =   -1 'True
      FullRowSelect   =   -1 'True
      _Version        =   393217
      ForeColor       =   -2147483640
      BackColor       =   -2147483643
      BorderStyle     =   1
      Appearance      =   1
      NumItems        =   0
   End
   Begin VB.CommandButton cmdList
      Caption         =   "List"
      Height          =   390
      Left            =   4050
      TabIndex        =   1
      Top             =   900
      Width           =   1665
   End
   Begin VB.CommandButton cmdGetRandom
      Caption         =   "Get Random"
      Height          =   390
      Left            =   4050
      TabIndex        =   0
      Top             =   225
      Width           =   1665
   End
End
Attribute VB_Name = "frmSortRand"
Attribute VB_GlobalNameSpace = False
Attribute VB_Creatable = False
Attribute VB_PredeclaredId = True
Attribute VB_Exposed = False
Option Explicit

' MAX_RAND is used three ways by the procedures of this form: the number of
' random items generated, the largest random value generated, and the upper
' bound of the "seen" array used to de-duplicate them.
Const MAX_RAND = 1000
' Fills lstRandom with MAX_RAND random whole numbers in the range 1..MAX_RAND.
' Each item is keyed "id=<position>"; repeated values are expected.
Private Sub cmdGetRandom_Click()
    Dim nIndex As Integer

    ' Seed the generator from the system timer. Without a Randomize call,
    ' Rnd produces the same sequence on every run of the program.
    Randomize

    lstRandom.ListItems.Clear
    For nIndex = 1 To MAX_RAND
        ' Int((MAX_RAND * Rnd) + 1) maps Rnd's [0, 1) onto 1..MAX_RAND.
        lstRandom.ListItems.Add , "id=" & nIndex, CStr(Int((MAX_RAND * Rnd) + 1))
    Next nIndex
End Sub
' Shows the distinct values of lstRandom in lstOrder, in ascending order.
' Each value is used directly as an index into a flag array, so duplicates
' collapse onto the same slot and walking the array yields sorted output.
' Every item text must convert to an integer within the bounds of aSorted
' (0..MAX_RAND under the default Option Base); anything else raises an error.
Private Sub cmdList_Click()
    Dim aSorted(MAX_RAND) As Integer    ' 1 = value has been seen, 0 = not seen
    Dim nIndex As Integer

    ' Pass 1: flag every value that occurs in the source list.
    For nIndex = 1 To lstRandom.ListItems.Count
        aSorted(CInt(lstRandom.ListItems.Item(nIndex).Text)) = 1
    Next nIndex

    ' Pass 2: emit the flagged values; index order is numeric order.
    lstOrder.ListItems.Clear
    For nIndex = 0 To MAX_RAND
        If aSorted(nIndex) = 1 Then
            lstOrder.ListItems.Add , "id=" & nIndex, CStr(nIndex)
        End If
    Next nIndex
End Sub
' Adds one column header to each ListView so that items are visible in the
' column-based view set at design time (View = 3).
' The column is 265 units narrower than the control
' (NOTE(review): presumably to leave room for a vertical scroll bar - confirm).
Private Sub Form_Load()
    lstRandom.ColumnHeaders.Add , , , lstRandom.Width - 265
    lstOrder.ColumnHeaders.Add , , , lstOrder.Width - 265
End Sub
ASKER CERTIFIED SOLUTION
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
ASKER
Hi,
Just so you guys don't think I am ignoring this question: so far I prefer hornet241's solution. In fact, it's pretty impressive. But just in case there is a better way, I will leave this question open until tomorrow evening. If by then there is no better answer, then you can have the points.
Zaphod.
Just so you guys don't think I am ignoring this question: so far I prefer hornet241's solution. In fact, it's pretty impressive. But just in case there is a better way, I will leave this question open until tomorrow evening. If by then there is no better answer, then you can have the points.
Zaphod.
Zaphod, here is another possible solution. It works equally well with strings and numbers, and it uses the Collection object, which lets you test directly, by key, whether an element already exists:
' Distinct numbers collected so far. Each item is stored under its own string
' form as the key, which is what NumberExists looks up.
Private colNumbers As Collection

' Demo: repeatedly draws a random number and adds it to colNumbers only when
' it is not already present. Label1 shows "<distinct count> / <numbers drawn>"
' and Label2 shows the latest number. The final distinct count is shown in a
' message box when the loop ends.
Private Sub Command3_Click()
    Dim lngNumber As Long
    ' The counter is a Double because the loop limit (1E+13) is far outside
    ' the range of a Long: a Long counter overflows with a run-time error
    ' long before the exit condition could ever become true.
    Dim dblCount As Double

    Set colNumbers = New Collection
    Do
        dblCount = dblCount + 1
        lngNumber = Rnd() * 10000
        If Not NumberExists(lngNumber) Then
            colNumbers.Add lngNumber, CStr(lngNumber)
        End If
        Label1.Caption = colNumbers.Count & " / " & dblCount
        Label2.Caption = lngNumber
        ' Yield to the message loop so the labels can repaint while looping.
        DoEvents
    Loop Until dblCount >= 10000000000000#
    MsgBox colNumbers.Count
    Set colNumbers = Nothing
End Sub
' Returns True when colNumbers already holds an item keyed by CStr(Number),
' False otherwise (including when the lookup fails for any other reason,
' for example because colNumbers has not been created).
'
' A Collection has no "contains" method, so the lookup is attempted and the
' outcome is read from Err.Number. This replaces the earlier form that
' depended on where execution resumes after an error inside an If condition.
Private Function NumberExists(ByVal Number As Long) As Boolean
    Dim varItem As Variant

    On Error Resume Next
    Err.Clear
    varItem = colNumbers.Item(CStr(Number))
    NumberExists = (Err.Number = 0)
    On Error GoTo 0
End Function
This example uses random numbers. If you run it (with the label controls, to see what is going on), you can see that the count of numbers tested goes up and up, but the count of elements climbs (relatively) slowly to the maximum and then just sits there.
' Distinct numbers collected so far. Each item is stored under its own string
' form as the key, which is what NumberExists looks up.
Private colNumbers As Collection

' Demo: repeatedly draws a random number and adds it to colNumbers only when
' it is not already present. Label1 shows "<distinct count> / <numbers drawn>"
' and Label2 shows the latest number. The final distinct count is shown in a
' message box when the loop ends.
Private Sub Command3_Click()
    Dim lngNumber As Long
    ' The counter is a Double because the loop limit (1E+13) is far outside
    ' the range of a Long: a Long counter overflows with a run-time error
    ' long before the exit condition could ever become true.
    Dim dblCount As Double

    Set colNumbers = New Collection
    Do
        dblCount = dblCount + 1
        lngNumber = Rnd() * 10000
        If Not NumberExists(lngNumber) Then
            colNumbers.Add lngNumber, CStr(lngNumber)
        End If
        Label1.Caption = colNumbers.Count & " / " & dblCount
        Label2.Caption = lngNumber
        ' Yield to the message loop so the labels can repaint while looping.
        DoEvents
    Loop Until dblCount >= 10000000000000#
    MsgBox colNumbers.Count
    Set colNumbers = Nothing
End Sub
' Returns True when colNumbers already holds an item keyed by CStr(Number),
' False otherwise (including when the lookup fails for any other reason,
' for example because colNumbers has not been created).
'
' A Collection has no "contains" method, so the lookup is attempted and the
' outcome is read from Err.Number. This replaces the earlier form that
' depended on where execution resumes after an error inside an If condition.
Private Function NumberExists(ByVal Number As Long) As Boolean
    Dim varItem As Variant

    On Error Resume Next
    Err.Clear
    varItem = colNumbers.Item(CStr(Number))
    NumberExists = (Err.Number = 0)
    On Error GoTo 0
End Function
This example uses random numbers. If you run it (with the label controls, to see what is going on), you can see that the count of numbers tested goes up and up, but the count of elements climbs (relatively) slowly to the maximum and then just sits there.
Accept hornet's code; it is very well written, even if I say so myself.
well done horn
cheers
Andy
well done horn
cheers
Andy
One way that I have used in the past is to read the list into an array (or a database if the number of unique values is truly huge).
Then scan the array (db) for each new number you read, and add or discard it.
Using an SQL compliant db has a couple of other advantages - in that you can dump the list sorted/filtered by any number of criteria, whereas you have to perform the operations yourself on an array - but arrays/RAM is much faster if the sample set can fit into memory.