Avatar of mmalik15
mmalik15
 asked on

Can any .NET guru help with writing POST parameters for screen scraping a website?

I have been trying the following code but am struggling to scrape the second and following pages from the website:

http://www.ihi.org/knowledge/pages/ViewAll.aspx?FilterField1=IHI_x0020_Content_x0020_Type&FilterValue1=Measures&Filter1ChainingOperator=And&Targ

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Text;

/// <summary>
/// Code-behind for MHF2.aspx. On button click it downloads a remote ASP.NET page,
/// extracts its hidden postback fields (__VIEWSTATE / __EVENTVALIDATION) and replays
/// them in a form POST that targets the remote grid control, showing the intermediate
/// values and the final response in the page's text boxes.
/// </summary>
public partial class MHF2 : System.Web.UI.Page
{
    // Page being scraped; also sent as the Referer of both requests.
    private const string TargetUrl = "http://www.ihi.org/knowledge/pages/ViewAll.aspx?FilterField1=IHI_x0020_Content_x0020_Type&FilterValue1=Measures&Filter1ChainingOperator=And&Targ";

    // Already URL-encoded ($ -> %24) unique ID of the remote grid control that receives the postback.
    private const string EncodedEventTarget = "ctl00%24m%24g_06bd18ef_887b_4e8f_a9ba_e08b261f48b2%24ctl00%24a_oGrid";

    // The value is captured up to the first closing quote. The previous greedy
    // character class also matched quotes and newlines, so it could run far past
    // the end of the attribute.
    private static readonly Regex ViewStateRegex =
        new Regex("id=\"__VIEWSTATE\"\\s+value=\"([^\"]*)\"", RegexOptions.IgnoreCase);

    private static readonly Regex EventValidationRegex =
        new Regex("id=\"__EVENTVALIDATION\"\\s+value=\"([^\"]*)\"", RegexOptions.IgnoreCase);

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Fetches the target page, parses its postback fields and re-posts them.
    /// Network failures are reported in txtResponse2 instead of crashing the page.
    /// </summary>
    protected void btnSubmit_Click1(object sender, EventArgs e)
    {
        txtEventTarget.Text = "";
        txtEventArgument.Text = "";
        txtViewState.Text = "";
        txtEventValidation.Text = "";

        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(TargetUrl);
            request.Referer = TargetUrl;

            string html = GetResponse(TargetUrl, ref request);
            string[] data = ParseHTML(html);

            txtEventTarget.Text = data[0];
            txtEventArgument.Text = data[1];

            if (data[2].Length == 0)
            {
                // Without a view state the remote page cannot process the postback.
                txtResponse2.Text = "Could not find the __VIEWSTATE field in the downloaded page.";
                return;
            }

            txtResponse2.Text = PostRequest(TargetUrl, data, ref request);
        }
        catch (WebException ex)
        {
            txtResponse2.Text = "Request failed: " + ex.Message;
        }
        catch (IOException ex)
        {
            txtResponse2.Text = "Request failed: " + ex.Message;
        }
    }

    /// <summary>
    /// Sends the postback fields to <paramref name="url"/> as a form POST and returns the response body.
    /// </summary>
    /// <param name="url">Address to post to; also used as the Referer.</param>
    /// <param name="args">Already URL-encoded values: [0] event target, [1] event argument,
    /// [2] view state, [3] event validation (may be empty).</param>
    /// <param name="request">Replaced with the newly created POST request.</param>
    /// <returns>The response body as text.</returns>
    /// <exception cref="WebException">The request could not be sent or the server returned an error status.</exception>
    private string PostRequest(string url, string[] args, ref HttpWebRequest request)
    {
        StringBuilder postData = new StringBuilder();
        postData.Append("__EVENTTARGET=").Append(args[0]);
        postData.Append("&__EVENTARGUMENT=").Append(args[1]);
        postData.Append("&__VIEWSTATE=").Append(args[2]);

        // Pages without event validation do not emit the field, so only send it when it was found.
        if (!string.IsNullOrEmpty(args[3]))
        {
            postData.Append("&__EVENTVALIDATION=").Append(args[3]);
        }

        txtPostBack.Text = postData.ToString();

        // The body is fully percent-encoded at this point, so ASCII is lossless.
        byte[] data = Encoding.ASCII.GetBytes(postData.ToString());

        request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;
        request.Referer = url;

        // using closes the stream exactly once, even if the write throws.
        using (Stream requestStream = request.GetRequestStream())
        {
            requestStream.Write(data, 0, data.Length);
        }

        return GetResponse(url, ref request);
    }

    /// <summary>
    /// Executes <paramref name="request"/> and returns the whole response body as text.
    /// </summary>
    /// <param name="url">Unused; kept so existing call sites stay unchanged.</param>
    /// <param name="request">The prepared request to execute.</param>
    /// <returns>The response body decoded as UTF-8 (a byte-order mark, if present, overrides this).</returns>
    /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
    private string GetResponse(string url, ref HttpWebRequest request)
    {
        // Failures are allowed to propagate: the earlier version swallowed them and
        // then hit a NullReferenceException while closing a response that never existed.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the postback fields from a downloaded ASP.NET page.
    /// </summary>
    /// <param name="html">Markup of the page; null is treated as empty.</param>
    /// <returns>Four URL-encoded values: [0] event target, [1] event argument (always empty),
    /// [2] view state, [3] event validation. Fields that are not present yield an empty string.</returns>
    private string[] ParseHTML(string html)
    {
        string[] data = new string[4];

        // Set the EVENTTARGET control
        data[0] = EncodedEventTarget;

        // Set the EVENTARGUMENT, should be an empty string
        data[1] = "";

        // Raw values are shown in the text boxes; encoded values go into the POST body.
        string viewState = ExtractHiddenField(ViewStateRegex, html);
        txtViewState.Text = viewState;
        data[2] = HttpUtility.UrlEncode(viewState);

        string eventValidation = ExtractHiddenField(EventValidationRegex, html);
        txtEventValidation.Text = eventValidation;
        data[3] = HttpUtility.UrlEncode(eventValidation);

        return data;
    }

    // Returns the captured value attribute of a hidden field, or "" when the field is absent.
    private static string ExtractHiddenField(Regex pattern, string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return "";
        }

        Match match = pattern.Match(html);
        return match.Success ? match.Groups[1].Value : "";
    }
}

Open in new window


And the front end is


<%@ Page Language="C#" AutoEventWireup="true" CodeFile="MHF2.aspx.cs" Inherits="MHF2" %>
<%-- Front end for the MHF2 code-behind class: a set of text boxes plus a Submit button
     whose click is handled by btnSubmit_Click1. --%>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title></title>
</head>
<body>
    <form id="form1" runat="server">
    <div>
        <%-- The text boxes below are written to by the code-behind: the postback field
             values it sends, the composed POST body (txtPostBack) and the text returned
             by the POST (txtResponse2). --%>
        <asp:TextBox ID="txtEventTarget" runat="server"></asp:TextBox><br />
        <asp:TextBox ID="txtEventArgument" runat="server"></asp:TextBox><br />
        <asp:TextBox ID="txtViewState" runat="server"></asp:TextBox><br />
        <asp:TextBox ID="txtResponse2" runat="server"></asp:TextBox><br />
        <asp:TextBox ID="txtEventValidation" runat="server"></asp:TextBox><br />
          <asp:TextBox ID="txtPostBack" runat="server"></asp:TextBox><br />
        <%-- Clicking the button posts back and runs btnSubmit_Click1 on the server. --%>
        <asp:Button ID="btnSubmit" runat="server" Text="Submit"
            onclick="btnSubmit_Click1" />

    </div>
    </form>
</body>
</html>
C#ASP.NETJavaScript

Avatar of undefined
Last Comment
Miguel Oz

8/22/2022 - Mon
ASKER CERTIFIED SOLUTION
James Williams

Log in or sign up to see answer
Become an EE member today7-DAY FREE TRIAL
Members can start a 7-Day Free trial then enjoy unlimited access to the platform
Sign up - Free for 7 days
or
Learn why we charge membership fees
We get it - no one likes a content blocker. Take one extra minute and find out why we block content.
Not exactly the question you had in mind?
Sign up for an EE membership and get your own personalized solution. With an EE membership, you can ask unlimited troubleshooting, research, or opinion questions.
ask a question
mmalik15

ASKER
Thanks a lot for the comment, selvol. I really need to solve this problem.

I need to extract hyper-links from these pages. e.g.

http://www.ihi.org/knowledge/Pages/Measures/RegistrySize.aspx
http://www.ihi.org/knowledge/Pages/Measures/MeasuresRapidResponseTeams.aspx and so on

There are quite a lot of websites that have implemented ASP.NET GridViews with JavaScript paging, and I need links from those as well.

It's not a one-time thing; there are plenty of sites where I need this mechanism. I have not done any development in PHP, but if there is a straightforward solution in PHP then I don't mind learning and using it.

many thanks again.
mmalik15

ASKER
Inner text would be fine as well. thanks
Miguel Oz

Check:
http://www.dotnetperls.com/scraping-html
The above link downloads the page as a string and then has an example of how to search for hyperlinks embedded in hyperlink tags, such as the "More on this topic" box.

for more examples check:
http://www.codersource.net/MicrosoftNet/CAdvanced/HTMLScreenScrapinginC.aspx
Experts Exchange has (a) saved my job multiple times, (b) saved me hours, days, and even weeks of work, and often (c) makes me look like a superhero! This place is MAGIC!
Walt Forbes