Can any .NET guru help with writing the POST parameters for screen scraping a website?

mmalik15
mmalik15 used Ask the Experts™
on
I have been trying the following code, but I am struggling to scrape the second and subsequent pages from this website:

http://www.ihi.org/knowledge/pages/ViewAll.aspx?FilterField1=IHI_x0020_Content_x0020_Type&FilterValue1=Measures&Filter1ChainingOperator=And&Targ

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Text;

/// <summary>
/// Code-behind for MHF2.aspx. On button click it downloads the first page of a
/// remote ASP.NET listing, extracts the hidden __VIEWSTATE / __EVENTVALIDATION
/// fields from it, and replays them in a POST (a simulated postback) whose
/// response is shown in txtResponse2.
/// </summary>
public partial class MHF2 : System.Web.UI.Page
{
    // Page that is downloaded first and then posted back to.
    private const string ListingUrl = "http://www.ihi.org/knowledge/pages/ViewAll.aspx?FilterField1=IHI_x0020_Content_x0020_Type&FilterValue1=Measures&Filter1ChainingOperator=And&Targ";

    // Value sent as __EVENTTARGET. It is stored already URL-encoded
    // ("%24" is an encoded "$"), so it must not be encoded a second time.
    private const string EncodedGridEventTarget = "ctl00%24m%24g_06bd18ef_887b_4e8f_a9ba_e08b261f48b2%24ctl00%24a_oGrid";

    protected void Page_Load(object sender, EventArgs e)
    {
        // Nothing to do on load; all work happens in btnSubmit_Click1.
    }

    /// <summary>
    /// Downloads the listing page, extracts the postback fields and sends the
    /// postback. The diagnostic text boxes are filled along the way.
    /// </summary>
    /// <param name="sender">The button that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void btnSubmit_Click1(object sender, EventArgs e)
    {
        txtEventTarget.Text = "";
        txtEventArgument.Text = "";
        txtViewState.Text = "";
        txtEventValidation.Text = "";

        string url = ListingUrl;

        // Plain GET: no Content-Type is set because the request has no body.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Referer = url;

        string html = GetResponse(url, ref request);
        if (html.Length == 0)
        {
            // GetResponse returns an empty string when the download failed.
            txtResponse2.Text = "The first page could not be downloaded; no postback was sent.";
            return;
        }

        string[] data = ParseHTML(html);

        txtEventTarget.Text = data[0];
        txtEventArgument.Text = data[1];

        if (data[2].Length == 0)
        {
            // Without the view state there is nothing meaningful to post back.
            txtResponse2.Text = "No __VIEWSTATE field was found in the downloaded page; no postback was sent.";
            return;
        }

        txtResponse2.Text = PostRequest(url, data, ref request);
    }

    /// <summary>
    /// Sends the simulated postback and returns the response body.
    /// </summary>
    /// <param name="url">Address to post to; also used as the Referer.</param>
    /// <param name="args">
    /// Four values that are ALREADY URL-encoded, in this order:
    /// event target, event argument, view state, event validation.
    /// </param>
    /// <param name="request">Replaced with the POST request that was sent.</param>
    /// <returns>The response body, or an empty string if the request failed.</returns>
    private string PostRequest(string url, string[] args, ref HttpWebRequest request)
    {
        string postData = "__EVENTTARGET=" + args[0] + "&__EVENTARGUMENT=" + args[1];
        postData += "&__VIEWSTATE=" + args[2] + "&__EVENTVALIDATION=" + args[3];

        txtPostBack.Text = postData;

        // Every value is URL-encoded at this point, so the body is pure ASCII.
        byte[] data = Encoding.ASCII.GetBytes(postData);

        request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;
        request.Referer = url;

        try
        {
            // The using block closes the stream exactly once, even on failure.
            using (Stream body = request.GetRequestStream())
            {
                body.Write(data, 0, data.Length);
            }
        }
        catch (WebException ex)
        {
            // Record the failure in the page trace instead of writing a stack
            // trace into the rendered page, and do not send a half-written request.
            Trace.Warn("PostRequest", "Could not send the postback body.", ex);
            return string.Empty;
        }
        catch (IOException ex)
        {
            Trace.Warn("PostRequest", "Could not send the postback body.", ex);
            return string.Empty;
        }

        return GetResponse(url, ref request);
    }

    /// <summary>
    /// Executes <paramref name="request"/> and returns the response body as text.
    /// </summary>
    /// <param name="url">Address being requested; used only in trace messages.</param>
    /// <param name="request">The prepared request to execute.</param>
    /// <returns>The response body, or an empty string if the request failed.</returns>
    private string GetResponse(string url, ref HttpWebRequest request)
    {
        try
        {
            // using disposes the response, stream and reader in every case and
            // never touches a null reference when GetResponse() itself throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(resStream, Encoding.UTF8, true))
            {
                // Decoding the whole stream through one reader avoids splitting
                // multi-byte characters across buffer boundaries; ASCII decoding
                // would replace every non-ASCII character with '?'.
                return reader.ReadToEnd();
            }
        }
        catch (WebException err)
        {
            Trace.Warn("GetResponse", "Request to " + url + " failed.", err);
        }
        catch (IOException err)
        {
            Trace.Warn("GetResponse", "Reading the response from " + url + " failed.", err);
        }

        return string.Empty;
    }

    /// <summary>
    /// Builds the four postback values from the downloaded page and shows the
    /// raw view state / event validation in their text boxes.
    /// </summary>
    /// <param name="html">Markup of the page that was downloaded.</param>
    /// <returns>
    /// A four-element array of URL-encoded values: [0] event target,
    /// [1] event argument (always empty), [2] view state, [3] event validation.
    /// Elements 2 and 3 are empty strings when the field is not in the markup.
    /// </returns>
    private string[] ParseHTML(string html)
    {
        string viewState = ExtractHiddenField(html, "__VIEWSTATE");
        string eventValidation = ExtractHiddenField(html, "__EVENTVALIDATION");

        txtViewState.Text = viewState;
        txtEventValidation.Text = eventValidation;

        string[] data = new string[4];
        data[0] = EncodedGridEventTarget;
        data[1] = "";
        // UrlEncode escapes every reserved character, not just '/', '+' and '='.
        data[2] = HttpUtility.UrlEncode(viewState);
        data[3] = HttpUtility.UrlEncode(eventValidation);
        return data;
    }

    /// <summary>
    /// Returns the value attribute of the element whose id is
    /// <paramref name="fieldName"/>, or an empty string when it is not found.
    /// Expects the markup form id="NAME" value="...".
    /// </summary>
    private static string ExtractHiddenField(string html, string fieldName)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        // [^"]* stops at the closing quote, so the match cannot run on into
        // later markup the way a greedy \W class does.
        string pattern = "id=\"" + Regex.Escape(fieldName) + "\"\\s+value=\"([^\"]*)\"";
        Match match = Regex.Match(html, pattern);
        return match.Success ? match.Groups[1].Value : string.Empty;
    }
}

Open in new window


And the front end is


<%@ Page Language="C#" AutoEventWireup="true" CodeFile="MHF2.aspx.cs" Inherits="MHF2" %>
<%-- Diagnostic page for the MHF2 code-behind: each text box shows one piece of the
     simulated postback, and the button starts the download-and-post sequence. --%>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title></title>
</head>
<body>
    <form id="form1" runat="server">
    <div>
        <%-- Value sent as __EVENTTARGET. --%>
        <asp:TextBox ID="txtEventTarget" runat="server"></asp:TextBox><br />
        <%-- Value sent as __EVENTARGUMENT. --%>
        <asp:TextBox ID="txtEventArgument" runat="server"></asp:TextBox><br />
        <%-- __VIEWSTATE value extracted from the downloaded page. --%>
        <asp:TextBox ID="txtViewState" runat="server"></asp:TextBox><br />
        <%-- Body of the response returned by the POST request. --%>
        <asp:TextBox ID="txtResponse2" runat="server"></asp:TextBox><br />
        <%-- __EVENTVALIDATION value extracted from the downloaded page. --%>
        <asp:TextBox ID="txtEventValidation" runat="server"></asp:TextBox><br />
        <%-- Complete form-encoded body that is posted. --%>
          <asp:TextBox ID="txtPostBack" runat="server"></asp:TextBox><br />
        <%-- Runs btnSubmit_Click1 in the code-behind. --%>
        <asp:Button ID="btnSubmit" runat="server" Text="Submit"
            onclick="btnSubmit_Click1" />

    </div>
    </form>
</body>
</html>
Comment
Watch Question

Do more with

Expert Office
EXPERT OFFICE® is a registered trademark of EXPERTS EXCHANGE®
Commented:
Since no one has given an answer, I will ask.

What exactly do you want to extract?  The inner text? Formatted or not?
And so forth?

Are you willing to use PHP? And is this a one-time thing?



See attached

Selvol
Measures.txt

Author

Commented:
Thanks a lot for the comment, Selvol. I really need to solve this problem.

I need to extract hyperlinks from these pages, e.g.:

http://www.ihi.org/knowledge/Pages/Measures/RegistrySize.aspx
http://www.ihi.org/knowledge/Pages/Measures/MeasuresRapidResponseTeams.aspx and so on

There are quite a lot of websites that have implemented ASP.NET GridViews with JavaScript paging, and I need the links from those as well.

It's not a one-time thing; there are plenty of sites where I need this mechanism. I have not done any development in PHP, but if there is a straightforward solution in PHP then I don't mind learning and using it.

many thanks again.

Author

Commented:
Inner text would be fine as well. thanks
Miguel OzSenior Software Engineer
Top Expert 2009

Commented:
Check:
http://www.dotnetperls.com/scraping-html
The above link shows how to download the page as a string, and it has an example of how to search for hyperlinks embedded in hyperlink tags, such as those in the "More on this topic" box.

for more examples check:
http://www.codersource.net/MicrosoftNet/CAdvanced/HTMLScreenScrapinginC.aspx

Do more with

Expert Office
Submit tech questions to Ask the Experts™ at any time to receive solutions, advice, and new ideas from leading industry professionals.

Start 7-Day Free Trial