Link to home
Start Free TrialLog in
Avatar of cjinsocal581
cjinsocal581Flag for United States of America

asked on

VB.NET Application hook to WebPage PART 2

Thanks to S-Twilley for the initial solution: https://www.experts-exchange.com/questions/21369134/VB-NET-Application-hook-to-WebPage.html

Now for part two:

I need the application to be running all the time. As suggested, I should use a timer and a flag for each existing or new IE window that is opened. Parse the pages, then continue to run dorment until new pages are opened.

Part 3 will be the need to capture the highlighted numbers and save them to a ListBox control.
Avatar of S-Twilley
S-Twilley

   Private SWs As New SHDocVw.ShellWindows
    Private IE As SHDocVw.InternetExplorer

    Private Sub HighlightFaxes()
        Dim doc
        Dim trueDoc As mshtml.IHTMLDocument2

        Try
            For Each IE In SWs
                Try
                    If IE.ReadyState = SHDocVw.tagREADYSTATE.READYSTATE_COMPLETE Then
                        doc = IE.Document

                        If TypeOf doc Is mshtml.IHTMLDocument2 Then
                            trueDoc = DirectCast(doc, mshtml.IHTMLDocument2)
                            If trueDoc.readyState = "complete" Then
                                If trueDoc.body.innerHTML.ToLower.IndexOf("<span id=faxparse></span>") >= 0 Then
                                    MsgBox("Already done this page")
                                Else
                                    Dim sinnerHTML As String = trueDoc.body.innerHTML

                                    Dim arrPatterns(5) As String ' increase this to hold the different patterns

                                    ' put these in priority order... this can probably be improved or made more efficient... will post a better one soon
                                    arrPatterns(0) = "(\+|)1 \(\d{3}\) \d{3}-\d{4}"   ' Matches +1 (800) 123-4567
                                    arrPatterns(1) = "(\+|)1-\d{3}-\d{3}-\d{4}"   ' Matches +1-800-123-4567
                                    arrPatterns(2) = "\(\d{3}\)\s?\d{3}-\d{4}"   ' Matches (800) 123-4567 or (800)123-4567
                                    arrPatterns(3) = "\d{3}-\d{3}-\d{4}"
                                    arrPatterns(4) = "\d{3} \d{3} \d{4}"
                                    arrPatterns(5) = "\d{3}\.\d{3}\.\d{4}"

                                    Dim sPattern As String = String.Join("|", arrPatterns)
                                    sPattern = "(?<fax>(" & sPattern & "))"

                                    Dim regReplace As New Regex(sPattern)
                                    Dim sMatch As Match

                                    For Each sMatch In regReplace.Matches(sinnerHTML)
                                        Dim thisFax As String = sMatch.Groups("fax").Value
                                        AddToList(thisFax)
                                    Next
                                    sinnerHTML = regReplace.Replace(sinnerHTML, "<b><font color=red>${fax}</font></b>")
                                    trueDoc.body.innerHTML = "<span id=faxparse></span>" & sinnerHTML  ' add a flag
                                End If
                            End If
                        End If
                    End If
                Catch e2 As Exception
                    'Looks like there was an error with this instance of IE, ignore it and go onto the next instance
                End Try
            Next
        Catch e1 As Exception
            'Don't do anything
        End Try
    End Sub
    Sub AddToList(ByVal FaxNum As String)
        FaxNum = FaxNum.Trim(" ")   ' remove surrounding spaces

        FaxNum = New Regex("\.|-|\)|\(|\+").Replace(FaxNum, " ")   'replace . - (   ) and +  with spaces
        FaxNum = New Regex("\s{2,}").Replace(FaxNum, " ")  ' replace double or more spaces with single spaces
        FaxNum = FaxNum.Trim(" ")  ' trim surrounding spaces
        FaxNum = FaxNum.Replace(" ", "-") ' replace spaces with -

        ' if not in list... add it
        Dim ind As Integer = ListBox1.FindStringExact(FaxNum)
        If ind >= 0 Then
            'do nothing
        Else
            ListBox1.Items.Add(FaxNum)
        End If
    End Sub
I haven't put in the timer part  yet...   this will mark a read page... but if you navigate away and come back, it will parse it again (pages get updated after all)... but there is a check so not to add duplicate entries in the list.

I'll give you code in a moment that will make it parse a page once it moves to a different location... so basically you get your program to "hook" onto an instance of IE... and it will then keep parsing every page it comes to  (which makes the <span> tag useless... but thats a temporary measure just so you can see the list addition part)
Avatar of cjinsocal581

ASKER

Keep in mind the overall goal is to jsut have the ability to provide the option to save the number.

In other words, it will recognize the numbers, highlight them, then provide a cursor or context menu over the highlighted number to allow to save it to the list.

Make sense? I just don't want to go off on the wrong track.
No problem... I just want to get the "basics" out of the way... at the moment... I have a form with a button which when clicked will hook onto all instances of IE... any navigation on these IE windows from then on will be monitored by the program and when a document is rendered (whether it be in a frame or not).. it will be parsed and highlighted... at the moment, it doesn't become a link for you to click on and add to your program, it just sends any found links back to the program to be added to a general list.
Cool deal.
ASKER CERTIFIED SOLUTION
Avatar of S-Twilley
S-Twilley

Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial
SOLUTION
Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial
There is part that doesn't work properly...

I tried to add support for when an existing IE window which hasnt been hooked...

At the moment, when the program hooks onto it, it parses the top level document (if its complete)... but not any nested documents... i attempted it, but doesn't seem to work

if you navigate away and come back again though, it does trap nested documents and parses them but this is done in a different way
I am getting an error with the following:

AddToList(thisFax)

It is stating thisFax cannot be converted to string.
that line isn't in the latest version of my code
you can get rid of the HighlightFaxes method... or just comment it out
Ok.
I created a new solution and that solved it.

I see the highlighted numbers now.

Are we not loading them to the Listbox control yet?
Nevermind.

I see it now, had to reload it. (the solution)
even tho the code highlights both

1-800-456-4567

and

1 (800) 456 4567


It'll only add one item to the listbox since they're both the same number... least thats what is supposed to happen :P
Ok. Here is what I have so far.

I run the app. I click the button. I then open http://secureconditions.com/faxtest.asp. I then see the numbers hightlight red but nothing is added to the listbox.

I then open the same page: http://secureconditions.com/faxtest.asp again. Nothing happens until I click the button again and I get a MsgBox stating "Already done that page" with nothing being added to the ListBox.

I close all IE windows and close the app. I then open an IE page: http://secureconditions.com/faxtest.asp and then run the app. It appears I have to click the button each time for it to highlight the Numbers.

Is this where we need to be at this point?

By the way, excellent assistance on this one.
Now here is something interesting.

I place the HookOn() in the Load event and I run it. I then open http://secureconditions.com/faxtest.asp and the numbers are highlighted and the Listbox contains the numbers that are highlighted (of course not duplcating)

but when I open any other IE windows, it does not highlight them. for instance, this page: https://www.experts-exchange.com/questions/21382320/VB-NET-Application-hook-to-WebPage-PART-2.html

It does not hightlight the ones above.

I will try adding this to the page: 1 (800) 265-5555 and see what happens.
Ok. I left this page up and running. I then re-ran the app and it highlighted the number above. 1 (800) 265-5555
sorry for thinking out loud (here) but I am finding that we are getting closer.

By the way, it highlighted that number again and all the other numbers on this page.
ok... just thinkin about the "Already done that page"  bit....  question for you i guess....

that condition basically tells you when a page has already been formatted, so we don't need to format it again.

... but from what you just said, you might have re-run our application and the list is empty again, so we still want to retrieve the fax numbers... just not format them again, i'll post up some editted code in a moment
Actually, I want to concentrate on the formatting piece first.

Let me test the app running and open and close IE windows.

then, I will want to work on Part 3.
Ok. COOL!

We have the first piece down.

Here is what i did. I had an existing IE page open. (this one)

I ran the app and minimized it. I saw the numbers reformatted. WOOHOO

I opened up other web pages and saw the number formatted!! WOOHOO
ok... let me post up my latest version on this post tho... i changed the class so rather than doing RaiseEvents.. it has a delegate instead.... give me a couple of mins if you can!
no worries..
For the form:
=========================

    Dim myFaxCatcher As New IE_Wrapper.faxCatcher(AddressOf AddFaxNumber)
    Dim myWinCatcher As New IE_Wrapper.newWinCatcher(AddressOf AddToList)
    Dim myWinCloser As New IE_Wrapper.closeWinCatcher(AddressOf RemoveFromList)

    Private Sub HookOn()
        Try
            For Each IE In SWs
                Try
                    If IsListed(IE.HWND) Then
                        'do nothing
                    Else
                        Dim newWrapper As New IE_Wrapper(IE, myFaxCatcher, myWinCatcher, myWinCloser)
                        newWrapper.ParseDocuments()
                        AddToList(newWrapper)
                    End If
                Catch ex As Exception

                End Try
            Next
        Catch

        End Try
    End Sub

    Public Sub AddToList(ByRef backref As IE_Wrapper)
        Try
            WindowList.Add("WIN" & backref.HWND)
        Catch ex As Exception

        End Try
    End Sub

==========================

NEW CLASS

==========================

Public Class IE_Wrapper
    Implements IDisposable

    Public Delegate Sub faxCatcher(ByVal NewFax As String, ByVal URL As String)
    Public Delegate Sub newWinCatcher(ByRef backRef As IE_Wrapper)
    Public Delegate Sub closeWinCatcher(ByRef backRef As IE_Wrapper)

    Private catcher As faxCatcher
    Private creator As newWinCatcher
    Private closer As closeWinCatcher


    Private WithEvents IE_Inst As SHDocVw.InternetExplorer
    Private Valid As Boolean = True
    Private Declare Function IsWindow Lib "user32" Alias "IsWindow" (ByVal hwnd As Integer) As Boolean

    Public Sub New(ByVal NewInst As SHDocVw.InternetExplorer, ByVal thisCatcher As faxCatcher, ByVal thisNewWin As newWinCatcher, ByVal thisCloser As closeWinCatcher)
        Try
            IE_Inst = NewInst
            catcher = thisCatcher
            creator = thisNewWin
            closer = thisCloser

            Valid = True
        Catch ex As Exception
            Valid = False
        End Try
    End Sub

    Public ReadOnly Property IsValid() As Boolean
        Get
            Return Valid
        End Get
    End Property
    Public ReadOnly Property IsAlive() As Boolean
        Get
            Try
                Return IsWindow(IE_Inst.HWND)
            Catch ex As Exception
                Return False
            End Try
        End Get
    End Property
    Public ReadOnly Property HWND() As Integer
        Get
            Try
                Return IE_Inst.HWND
            Catch ex As Exception
                Return 0
            End Try
        End Get
    End Property

    Private Sub docComplete(ByVal pDisp As Object, ByRef URL As Object) Handles IE_Inst.DocumentComplete
        Try
            Dim thisDoc As mshtml.IHTMLDocument2
            thisDoc = pDisp.document
            highlightFax(thisDoc)
        Catch ex As Exception

        End Try
    End Sub

    Public Sub ParseDocuments()
        Try

            Dim thisDoc As mshtml.IHTMLDocument2

            thisDoc = IE_Inst.Document
            _parseDocument(thisDoc)
        Catch ex As Exception

        End Try
    End Sub

    Private Sub _parseDocument(ByVal thisDoc As mshtml.IHTMLDocument2)
        Try
            If thisDoc.readyState.ToLower = "complete" Then
                highlightFax(thisDoc)

                Dim framesColl As mshtml.IHTMLFramesCollection2 = thisDoc.frames
                Dim thisFrame As mshtml.IHTMLFrameBase2
                Dim thisFrameIndex As Integer

                For thisFrameIndex = 0 To framesColl.length - 1
                    thisFrame = framesColl.item(thisFrameIndex)
                    _parseDocument(thisFrame.contentWindow.document)
                Next

            End If
        Catch ex As Exception

        End Try
    End Sub

    Private Sub highlightFax(ByVal thisDoc As mshtml.IHTMLDocument2)
        Try
            Dim boolAlreadyFormatted As Boolean
            boolAlreadyFormatted = (thisDoc.body.innerHTML.ToLower.IndexOf("<span id=faxparse></span>") >= 0)
            Dim sinnerHTML As String = thisDoc.body.innerHTML

            Dim arrPatterns(3) As String ' increase this to hold the different pattern
            arrPatterns(0) = "(\+|)1\s?\(\d{3}\)\s?\d{3}(?<sep>(\.|\s|-))\d{4}"   ' Matches +1 (800) 123-4567
            arrPatterns(1) = "(\+|)1(?<sep>(\.|\s|-))\d{3}(?<sep>(\.|\s|-))\d{3}(?<sep>(\.|\s|-))\d{4}"   ' Matches +1-800-123-4567
            arrPatterns(2) = "\(\d{3}\)\s?\d{3}-\d{4}"   ' Matches (800) 123-4567 or (800)123-4567
            arrPatterns(3) = "\d{3}(?<sep>(\.|\s|-))\d{3}\k'sep'\d{4}"


            Dim sPattern As String = String.Join("|", arrPatterns)
            sPattern = "(?<fax>(" & sPattern & "))"

            Dim regReplace As New Regex(sPattern)
            Dim sMatch As Match

            For Each sMatch In regReplace.Matches(sinnerHTML)
                Dim thisFax As String = sMatch.Groups("fax").Value
                'RaiseEvent FaxNumberFound(thisFax, thisDoc.url)
                catcher.Invoke(thisFax, thisDoc.url)
            Next

            If Not boolAlreadyFormatted Then
                sinnerHTML = regReplace.Replace(sinnerHTML, "<b><font color=red>${fax}</font></b>")
                thisDoc.body.innerHTML = "<span id=faxparse></span>" & sinnerHTML  ' add a flag
            End If
        Catch ex As Exception
        End Try
    End Sub

    Private Sub newWin(ByRef ppDisp As Object, ByRef Cancel As Boolean, ByVal dwFlags As System.UInt32, ByVal bstrUrlContext As String, ByVal bstrUrl As String) Handles IE_Inst.NewWindow3
        Try
            Dim newIE As New SHDocVw.InternetExplorer
            ppDisp = newIE
            Dim newIE_Wrapper As New IE_Wrapper(ppDisp, catcher, creator, closer)
            creator.Invoke(newIE_Wrapper)
        Catch ex As Exception

        End Try
    End Sub
    Private Sub ieClosing() Handles IE_Inst.OnQuit
        closer.Invoke(Me)
    End Sub

    Public Sub Dispose() Implements System.IDisposable.Dispose
        Try
            IE_Inst = Nothing
        Catch ex As Exception

        End Try
    End Sub
End Class
What is WindowList?
same as before... sorry... the code above was just meant as an update... so replace the "like named" methods... as well as adding those few extra bits:

    Dim myFaxCatcher As New IE_Wrapper.faxCatcher(AddressOf AddFaxNumber)
    Dim myWinCatcher As New IE_Wrapper.newWinCatcher(AddressOf AddToList)
    Dim myWinCloser As New IE_Wrapper.closeWinCatcher(AddressOf RemoveFromList)

(replace the entire class as well)
I reverted back to the original code I accepted in this PAQ. I am getting tons of errors when I update the code with the new additions.
ok... I'll just post up all the code involved... start a new solution... add your test button and a listbox... get the mshtml reference and shdocw reference thing as well:

Imports System.Text.RegularExpressions

at the top of both form and class file:


=============================

FORM CODE:

    Private SWs As New SHDocVw.ShellWindows
    Private IE As SHDocVw.InternetExplorer

    Private Declare Function IsWindow Lib "user32" Alias "IsWindow" (ByVal hwnd As Integer) As Boolean
    Private WindowList As New Collection

    Dim myFaxCatcher As New IE_Wrapper.faxCatcher(AddressOf AddFaxNumber)
    Dim myWinCatcher As New IE_Wrapper.newWinCatcher(AddressOf AddToList)
    Dim myWinCloser As New IE_Wrapper.closeWinCatcher(AddressOf RemoveFromList)

    Private Sub CleanOutList()
        Dim win As IE_Wrapper
        Dim newList As New Collection

        For Each win In WindowList
            If win.IsAlive Then
                newList.Add(win, "WIN" & win.HWND)
            Else
                'do nothing
            End If
        Next
        WindowList = newList
    End Sub
    Private Function IsListed(ByVal CheckHWND As Integer) As Boolean
        Try
            Dim xHWND As IE_Wrapper = WindowList("WIN" & CheckHWND)
            Return True
        Catch ex As Exception
            Return False
        End Try

    End Function

    Private Sub HookOn()
        Try
            For Each IE In SWs
                Try
                    If IsListed(IE.HWND) Then
                        'do nothing
                    Else
                        Dim newWrapper As New IE_Wrapper(IE, myFaxCatcher, myWinCatcher, myWinCloser)
                        newWrapper.ParseDocuments()
                        AddToList(newWrapper)
                    End If
                Catch ex As Exception

                End Try
            Next
        Catch

        End Try
    End Sub

    Sub AddFaxNumber(ByVal FaxNum As String, ByVal sourceURL As String)
        FaxNum = FaxNum.Trim(" ")
        FaxNum = New Regex("\.|-|\)|\(|\+").Replace(FaxNum, " ")
        FaxNum = New Regex("\s{2,}").Replace(FaxNum, " ")
        FaxNum = FaxNum.Trim(" ")
        FaxNum = FaxNum.Replace(" ", "-")

        Dim ind As Integer = ListBox1.FindStringExact(FaxNum)
        If ind >= 0 Then
            'do nothing
        Else
            ListBox1.Items.Add(FaxNum)
        End If
    End Sub

    Sub RemoveFromList(ByRef backRef As IE_Wrapper)
        Try
            WindowList.Remove("WIN" & backRef.HWND)
        Catch ex As Exception

        End Try
    End Sub

    Public Sub AddToList(ByRef backref As IE_Wrapper)
        Try
            WindowList.Add("WIN" & backref.HWND)
        Catch ex As Exception

        End Try
    End Sub

    Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
        HookOn()
    End Sub


=======================================

CLASS CODE:

Public Class IE_Wrapper
    Implements IDisposable

    Public Delegate Sub faxCatcher(ByVal NewFax As String, ByVal URL As String)
    Public Delegate Sub newWinCatcher(ByRef backRef As IE_Wrapper)
    Public Delegate Sub closeWinCatcher(ByRef backRef As IE_Wrapper)

    Private catcher As faxCatcher
    Private creator As newWinCatcher
    Private closer As closeWinCatcher


    Private WithEvents IE_Inst As SHDocVw.InternetExplorer
    Private Valid As Boolean = True
    Private Declare Function IsWindow Lib "user32" Alias "IsWindow" (ByVal hwnd As Integer) As Boolean

    Public Sub New(ByVal NewInst As SHDocVw.InternetExplorer, ByVal thisCatcher As faxCatcher, ByVal thisNewWin As newWinCatcher, ByVal thisCloser As closeWinCatcher)
        Try
            IE_Inst = NewInst
            catcher = thisCatcher
            creator = thisNewWin
            closer = thisCloser

            Valid = True
        Catch ex As Exception
            Valid = False
        End Try
    End Sub

    Public ReadOnly Property IsValid() As Boolean
        Get
            Return Valid
        End Get
    End Property
    Public ReadOnly Property IsAlive() As Boolean
        Get
            Try
                Return IsWindow(IE_Inst.HWND)
            Catch ex As Exception
                Return False
            End Try
        End Get
    End Property
    Public ReadOnly Property HWND() As Integer
        Get
            Try
                Return IE_Inst.HWND
            Catch ex As Exception
                Return 0
            End Try
        End Get
    End Property

    Private Sub docComplete(ByVal pDisp As Object, ByRef URL As Object) Handles IE_Inst.DocumentComplete
        Try
            Dim thisDoc As mshtml.IHTMLDocument2
            thisDoc = pDisp.document
            highlightFax(thisDoc)
        Catch ex As Exception

        End Try
    End Sub

    Public Sub ParseDocuments()
        Try

            Dim thisDoc As mshtml.IHTMLDocument2

            thisDoc = IE_Inst.Document
            _parseDocument(thisDoc)
        Catch ex As Exception

        End Try
    End Sub

    Private Sub _parseDocument(ByVal thisDoc As mshtml.IHTMLDocument2)
        Try
            If thisDoc.readyState.ToLower = "complete" Then
                highlightFax(thisDoc)

                Dim framesColl As mshtml.IHTMLFramesCollection2 = thisDoc.frames
                Dim thisFrame As mshtml.IHTMLFrameBase2
                Dim thisFrameIndex As Integer

                For thisFrameIndex = 0 To framesColl.length - 1
                    thisFrame = framesColl.item(thisFrameIndex)
                    _parseDocument(thisFrame.contentWindow.document)
                Next

            End If
        Catch ex As Exception

        End Try
    End Sub

    Private Sub highlightFax(ByVal thisDoc As mshtml.IHTMLDocument2)
        Try
            Dim boolAlreadyFormatted As Boolean
            boolAlreadyFormatted = (thisDoc.body.innerHTML.ToLower.IndexOf("<span id=faxparse></span>") >= 0)
            Dim sinnerHTML As String = thisDoc.body.innerHTML

            Dim arrPatterns(3) As String ' increase this to hold the different pattern
            arrPatterns(0) = "(\+|)1\s?\(\d{3}\)\s?\d{3}(?<sep>(\.|\s|-))\d{4}"   ' Matches +1 (800) 123-4567
            arrPatterns(1) = "(\+|)1(?<sep>(\.|\s|-))\d{3}(?<sep>(\.|\s|-))\d{3}(?<sep>(\.|\s|-))\d{4}"   ' Matches +1-800-123-4567
            arrPatterns(2) = "\(\d{3}\)\s?\d{3}-\d{4}"   ' Matches (800) 123-4567 or (800)123-4567
            arrPatterns(3) = "\d{3}(?<sep>(\.|\s|-))\d{3}\k'sep'\d{4}"


            Dim sPattern As String = String.Join("|", arrPatterns)
            sPattern = "(?<fax>(" & sPattern & "))"

            Dim regReplace As New Regex(sPattern)
            Dim sMatch As Match

            For Each sMatch In regReplace.Matches(sinnerHTML)
                Dim thisFax As String = sMatch.Groups("fax").Value
                'RaiseEvent FaxNumberFound(thisFax, thisDoc.url)
                catcher.Invoke(thisFax, thisDoc.url)
            Next

            If Not boolAlreadyFormatted Then
                sinnerHTML = regReplace.Replace(sinnerHTML, "<a href='${fax}' name='faxcatchlink' target='_blank'><b><font color=red>${fax}</font></b></a>")
                thisDoc.body.innerHTML = "<span id=faxparse></span>" & sinnerHTML  ' add a flag
                Dim theselinks = thisDoc.all.tags("a")
                Dim thisLink As mshtml.IHTMLAnchorElement


            End If

        Catch ex As Exception
        End Try
    End Sub

    Private Sub newWin(ByRef ppDisp As Object, ByRef Cancel As Boolean, ByVal dwFlags As System.UInt32, ByVal bstrUrlContext As String, ByVal bstrUrl As String) Handles IE_Inst.NewWindow3
        Try
            Dim newIE As New SHDocVw.InternetExplorer
            ppDisp = newIE
            Dim newIE_Wrapper As New IE_Wrapper(ppDisp, catcher, creator, closer)
            creator.Invoke(newIE_Wrapper)
        Catch ex As Exception

        End Try
    End Sub
    Private Sub ieClosing() Handles IE_Inst.OnQuit
        closer.Invoke(Me)
    End Sub

    Public Sub Dispose() Implements System.IDisposable.Dispose
        Try
            IE_Inst = Nothing
        Catch ex As Exception

        End Try
    End Sub
End Class
888 625 5656