Want to protect your cyber security and still get fast solutions? Ask a secure question today.Go Premium

x
  • Status: Solved
  • Priority: Medium
  • Security: Public
  • Views: 4060
  • Last Modified:

How to convert httpWebResponse into mshtml.HTMLDocument

Hi everybody,

I have something like

Private doc As New mshtml.HTMLDocument

' Download the page at v_crawlURL and keep its markup in "cont".
' WebRequest.Create / GetResponse return the base WebRequest / WebResponse types,
' so the HTTP-specific types are obtained with an explicit cast.
Dim URLReq As HttpWebRequest = DirectCast(WebRequest.Create(v_crawlURL), HttpWebRequest)
Dim URLRes As HttpWebResponse = DirectCast(URLReq.GetResponse(), HttpWebResponse)
Dim cont As String
Try
    Dim sStream As Stream = URLRes.GetResponseStream()
    Dim sr As New StreamReader(sStream)
    Try
        cont = sr.ReadToEnd()
    Finally
        ' Closing the reader also closes the underlying response stream.
        sr.Close()
    End Try
Finally
    ' Always release the connection, even if reading the body fails.
    URLRes.Close()
End Try

now I have the content of the page in the cont variable but I'm stuck here. How do I get it into a mshtml.HTMLDocument object ?

Should not be that hard...


Thanks in advance

Regards
0
wildcard76
Asked:
wildcard76
  • 8
  • 4
1 Solution
 
Bob LearnedCommented:
It is more difficult than one would think.  Here is what I use:

Option Strict On

Imports System.Runtime.InteropServices

''' -----------------------------------------------------------------------------
''' <summary>
''' Loads an HTML document through MSHTML (from a URL or from an existing
''' mshtml.HTMLDocument) and collects its anchor and image elements.
''' </summary>
''' -----------------------------------------------------------------------------
Public Class HtmlDocument

  ' Collected HtmlAnchor / HtmlImage items; exposed as typed arrays by the
  ' Anchors and Images properties.
  Private _anchors As New ArrayList
  Private _images As New ArrayList

  ' COM IPersist, declared so IPersistStreamInit can inherit from it.
  <ComImport(), Guid("0000010c-0000-0000-C000-000000000046"), _
  InterfaceType(ComInterfaceType.InterfaceIsIUnknown)> _
  Interface IPersist
    Sub GetClassID(ByRef pClassId As Guid)
  End Interface

  ' COM IPersistStreamInit. Only InitNew() is called by this class; the member
  ' order below must not be changed because it defines the COM vtable layout.
  <ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), _
  InterfaceType(ComInterfaceType.InterfaceIsIUnknown)> _
  Interface IPersistStreamInit : Inherits IPersist
    Shadows Sub GetClassID(ByRef pClassId As Guid)
    <PreserveSig()> _
    Function IsDirty() As Integer
    Sub Load(ByVal pStm As UCOMIStream)
    Sub Save(ByVal pStm As UCOMIStream, _
    <MarshalAs(UnmanagedType.Bool)> ByVal fClearDirty As Boolean)
    Sub GetMaxSize(ByRef pCbSize As Long)
    Sub InitNew()
  End Interface

  Private m_document As mshtml.HTMLDocument
  Private m_url As String = ""

  ' Exception raised on the worker thread, if any. Without this the waiting
  ' loop in the constructor would spin forever after a failed load.
  Private m_loadError As Exception

  ''' -----------------------------------------------------------------------------
  ''' <summary>
  ''' Use a thread to start the document retrieval process--waiting until
  ''' the document is ready, then collect its anchors and images.
  ''' </summary>
  ''' <param name="url">The URL of the HTML document to get.</param>
  ''' -----------------------------------------------------------------------------
  Public Sub New(ByVal url As String)

    m_url = url

    ' NOTE(review): MSHTML is an apartment-threaded COM component; confirm the
    ' apartment state of this worker thread is appropriate for your host.
    Dim thread As New Threading.Thread(AddressOf StartGetDocument)
    thread.Start()

    ' Keep pumping messages until the document reports "complete" or the
    ' worker thread reports a failure.
    While m_loadError Is Nothing AndAlso _
          (m_document Is Nothing OrElse m_document.readyState <> "complete")
      Application.DoEvents()
    End While

    If Not m_loadError Is Nothing Then
      Throw New InvalidOperationException( _
        "Unable to load the HTML document from '" & m_url & "'.", m_loadError)
    End If

    Me.FindAnchors(m_document)
    Me.FindImages(m_document)

  End Sub

  '''  -----------------------------------------------------------------------------
  ''' <summary>
  ''' Background thread worker function that initializes an IPersistStreamInit for the
  ''' document retrieval. Any failure is stored in m_loadError for the constructor.
  ''' </summary>
  ''' -----------------------------------------------------------------------------
  Private Sub StartGetDocument()

    Try
      Dim doc As New mshtml.HTMLDocument

      ' Initialize the stream to receive the document.  Without these lines
      ' you will get an 'Object not set' exception.
      Dim ips As IPersistStreamInit
      ips = DirectCast(doc, IPersistStreamInit)
      ips.InitNew()

      m_document = DirectCast(doc.createDocumentFromUrl(m_url, vbNullString), mshtml.HTMLDocument)
    Catch ex As Exception
      ' Hand the failure to the constructor's wait loop instead of losing it.
      m_loadError = ex
    End Try

  End Sub

  ''' -----------------------------------------------------------------------------
  ''' <summary>
  ''' Collect the anchors and images of an already loaded document
  ''' (for example the document of a Web Browser control).
  ''' </summary>
  ''' <param name="document">The loaded HTML document to scan.</param>
  ''' -----------------------------------------------------------------------------
  Public Sub New(ByVal document As mshtml.HTMLDocument)

    If document Is Nothing Then
      Throw New ArgumentNullException("document")
    End If

    m_document = document

    Me.FindAnchors(document)
    Me.FindImages(document)

  End Sub

  ''' -----------------------------------------------------------------------------
  ''' <summary>
  ''' Find all the anchor (&lt;a&gt; tags) in an HTML document
  ''' </summary>
  ''' <param name="document">The document from either a Web Browser control or from a URL.</param>
  ''' -----------------------------------------------------------------------------
  Private Sub FindAnchors(ByVal document As mshtml.HTMLDocument)

    ' getElementsByTagName selects by tag; getElementsByName would match the
    ' NAME attribute instead and return nothing for "a".
    ' Elements are consumed through IHTMLElement because the returned COM
    ' wrappers are not guaranteed to be castable to the coclass types.
    For Each element As mshtml.IHTMLElement In document.getElementsByTagName("a")

      Dim anchor As New HtmlAnchor
      anchor.HRef = GetAttribute(element, "href")
      anchor.Class = GetAttribute(element, "class")
      anchor.Text = element.innerText

      _anchors.Add(anchor)
    Next element

  End Sub

  ''' -----------------------------------------------------------------------------
  ''' <summary>
  ''' Find all the image (&lt;img&gt; tags) in an HTML document
  ''' </summary>
  ''' <param name="document">The document from either a Web Browser control or from a URL.</param>
  ''' -----------------------------------------------------------------------------
  Private Sub FindImages(ByVal document As mshtml.HTMLDocument)

    ' Select by tag name, not by NAME attribute (see FindAnchors).
    For Each element As mshtml.IHTMLElement In document.getElementsByTagName("img")

      Dim Image As New HtmlImage
      Image.Src = GetAttribute(element, "src")

      _images.Add(Image)
    Next element

  End Sub

  ''' -----------------------------------------------------------------------------
  ''' <summary>
  ''' Get the attribute value from an HTML element
  ''' </summary>
  ''' <param name="element">The element to get the value from</param>
  ''' <param name="attribName">The attribute name</param>
  ''' <returns>The attribute value, or empty string if not found</returns>
  ''' -----------------------------------------------------------------------------
  Private Function GetAttribute(ByVal element As mshtml.IHTMLElement, ByVal attribName As String) As String

    ' Query the COM object once and reuse the result.
    Dim value As Object = element.getAttribute(attribName)

    If value Is Nothing Then
      Return ""
    End If

    Return value.ToString()
  End Function

  ''' <summary>The anchors found in the document, as a typed array.</summary>
  Public ReadOnly Property Anchors() As HtmlAnchor()
    Get
      Return DirectCast(_anchors.ToArray(GetType(HtmlAnchor)), HtmlAnchor())
    End Get
  End Property

  ''' <summary>The images found in the document, as a typed array.</summary>
  Public ReadOnly Property Images() As HtmlImage()
    Get
      Return DirectCast(_images.ToArray(GetType(HtmlImage)), HtmlImage())
    End Get
  End Property

End Class

''' <summary>
''' Plain data holder for one anchor element; every field starts out empty.
''' </summary>
Public Class HtmlAnchor
  ''' <summary>Value of the anchor's "href" attribute.</summary>
  Public HRef As String = String.Empty

  ''' <summary>Value of the anchor's "class" attribute.</summary>
  Public [Class] As String = String.Empty

  ''' <summary>Inner text of the anchor.</summary>
  Public Text As String = String.Empty
End Class

''' <summary>
''' Plain data holder for one image element.
''' </summary>
Public Class HtmlImage
  ''' <summary>Value of the image's "src" attribute; starts out empty.</summary>
  Public Src As String = String.Empty
End Class

Bob

0
 
wildcard76Author Commented:
hi,

the class seems really solid and substantial
but when I try to do something like

        Dim ht As New HtmlDocument("http://www.google.com")
        Dim a As HtmlAnchor
        For Each a In ht.Anchors
            MessageBox.Show(a.HRef)
        Next

anchors property always returns empty regardless of the url i provide.

what may i be doing wrong ?

regards



0
 
wildcard76Author Commented:
actually images property is also empty...

raising points btw...

regards
0
Industry Leaders: We Want Your Opinion!

We value your feedback.

Take our survey and automatically be entered to win any one of the following:
Yeti Cooler, Amazon eGift Card, and Movie eGift Card!

 
wildcard76Author Commented:
ok when I debug and examine the document after m_document = DirectCast(doc.createDocumentFromUrl(m_url, vbNullString), mshtml.HTMLDocument) is executed, i noticed the below errors...

      baseUrl      <error: an exception of type: {System.NotImplementedException} occurred>      String
      enableDownload      <error: an exception of type: {System.NotImplementedException} occurred>      Boolean
      frames      <error: an exception of type: {System.InvalidCastException} occurred>      mshtml.FramesCollection
      IHTMLDocument2_frames      <error: an exception of type: {System.InvalidCastException} occurred>      mshtml.FramesCollection
      IHTMLDocument2_location      <error: an exception of type: {System.InvalidCastException} occurred>      mshtml.HTMLLocation
      IHTMLDocument2_parentWindow      <error: an exception of type: {System.InvalidCastException} occurred>      mshtml.IHTMLWindow2
      IHTMLDocument2_Script      <error: an exception of type: {System.InvalidCastException} occurred>      Object

and several others...

      title      "Google"      String

is present as well which means the sub actually connects to the url and receives some data...

regards

0
 
Bob LearnedCommented:
.NET version?

Bob
0
 
wildcard76Author Commented:
1.1
0
 
wildcard76Author Commented:
1.1 4322 to be precise
0
 
Bob LearnedCommented:
What type of application are you running from?  WinForms?  ASP.NET?

Bob
0
 
wildcard76Author Commented:
it is a winforms application...
0
 
wildcard76Author Commented:
I've experienced something similar before: while I was using an axWebBrowser control on a form, calling the Navigate2 method on the control while the form was not visible raised a COMInvalidState exception, which I corrected by simply showing the form before navigating. Could it be something similar here, because there are no visible controls, etc.? I know it's a long shot :)
0
 
wildcard76Author Commented:
found it...      

Dim oDoc As New mshtml.HTMLDocument

' IHTMLDocument2 exposes write()/close(); the explicit cast keeps this
' compiling under Option Strict On.
Dim iDoc As mshtml.IHTMLDocument2 = DirectCast(oDoc, mshtml.IHTMLDocument2)

' Write the downloaded markup (cont) into the document and finish the stream.
iDoc.write(cont)
iDoc.close()

' oDoc and iDoc refer to the same object, so oDoc can be used directly from
' here on; no assignment back is needed.

does the trick....

thanks for the help...
0
 
Bob LearnedCommented:
Ok, so next time I'll ask whether you want to use the WebBrowser control.  ;)  That was code that (if it worked) would need only a URL, with no ActiveX control running through interoperability.

Bob
0

Featured Post

Independent Software Vendors: We Want Your Opinion

We value your feedback.

Take our survey and automatically be entered to win any one of the following:
Yeti Cooler, Amazon eGift Card, and Movie eGift Card!

  • 8
  • 4
Tackle projects and never again get stuck behind a technical roadblock.
Join Now