Avatar of mmalik15
mmalik15

asked on 

Can any .NET guru help with writing POST parameters for screen scraping a website?

I have been trying the following code, but I am struggling to scrape the second and following pages from the website:

http://www.ihi.org/knowledge/pages/ViewAll.aspx?FilterField1=IHI_x0020_Content_x0020_Type&FilterValue1=Measures&Filter1ChainingOperator=And&Targ

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Text;

/// <summary>
/// Code-behind for MHF2.aspx. Downloads a listing page with a GET request,
/// extracts the ASP.NET hidden postback fields from the returned HTML, and
/// replays them in a POST request that simulates a postback to the same URL.
/// </summary>
public partial class MHF2 : System.Web.UI.Page
{
    /// <summary>Page that is downloaded first and then posted back to.</summary>
    private const string TargetUrl = "http://www.ihi.org/knowledge/pages/ViewAll.aspx?FilterField1=IHI_x0020_Content_x0020_Type&FilterValue1=Measures&Filter1ChainingOperator=And&Targ";

    /// <summary>
    /// Value sent as __EVENTTARGET. It is stored already URL-encoded
    /// ("%24" is "$") because PostRequest concatenates the values verbatim.
    /// </summary>
    private const string EncodedEventTarget = "ctl00%24m%24g_06bd18ef_887b_4e8f_a9ba_e08b261f48b2%24ctl00%24a_oGrid";

    /// <summary>
    /// Cookies shared by the GET and the POST so that any cookie issued with
    /// the first page is sent back with the simulated postback.
    /// </summary>
    private readonly CookieContainer _cookies = new CookieContainer();

    protected void Page_Load(object sender, EventArgs e)
    {

    }

    /// <summary>
    /// Click handler: fetches the page, parses the hidden fields, shows them in
    /// the diagnostic text boxes and puts the postback response in txtResponse2.
    /// </summary>
    /// <param name="sender">The button that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    protected void btnSubmit_Click1(object sender, EventArgs e)
    {
        txtEventTarget.Text = "";
        txtEventArgument.Text = "";
        txtViewState.Text = "";
        txtEventValidation.Text = "";

        // A GET request carries no body, so no Content-Type header is set here.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(TargetUrl);
        request.Referer = TargetUrl;
        request.CookieContainer = _cookies;

        string html = GetResponse(TargetUrl, ref request);
        if (html.Length == 0)
        {
            // Without the first page there is nothing to parse or to post back.
            txtResponse2.Text = "The initial page could not be downloaded; no postback was sent.";
            return;
        }

        string[] data = ParseHTML(html);

        txtEventTarget.Text = data[0];
        txtEventArgument.Text = data[1];

        if (data[2].Length == 0)
        {
            // A postback without __VIEWSTATE cannot restore the page state.
            txtResponse2.Text = "The __VIEWSTATE field was not found in the downloaded page; no postback was sent.";
            return;
        }

        txtResponse2.Text = PostRequest(TargetUrl, data, ref request);
    }

    /// <summary>
    /// Sends the postback fields as an application/x-www-form-urlencoded POST
    /// and returns the response body.
    /// </summary>
    /// <param name="url">Address to post to; also sent as the Referer header.</param>
    /// <param name="args">
    /// Four already URL-encoded values, in order: __EVENTTARGET,
    /// __EVENTARGUMENT, __VIEWSTATE, __EVENTVALIDATION.
    /// </param>
    /// <param name="request">Replaced with the request created for the POST.</param>
    /// <returns>The response body, or an empty string when the request fails.</returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="args"/> is null or has fewer than four elements.
    /// </exception>
    private string PostRequest(string url, string[] args, ref HttpWebRequest request)
    {
        if (args == null || args.Length < 4)
        {
            throw new ArgumentException("Four postback values are required.", "args");
        }

        string postData = "__EVENTTARGET=" + args[0] + "&__EVENTARGUMENT=" + args[1];
        postData += "&__VIEWSTATE=" + args[2] + "&__EVENTVALIDATION=" + args[3];

        txtPostBack.Text = postData;

        // The values are percent-encoded already, so the body is pure ASCII.
        byte[] data = Encoding.ASCII.GetBytes(postData);

        request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;
        request.Referer = url;
        request.CookieContainer = _cookies;

        try
        {
            // 'using' closes the stream exactly once, including on failure.
            using (Stream body = request.GetRequestStream())
            {
                body.Write(data, 0, data.Length);
            }
        }
        catch (WebException ex)
        {
            // Record the failure in the page trace instead of writing a stack
            // trace into the rendered page.
            Trace.Warn("MHF2", "Sending the postback body failed.", ex);
            return "";
        }
        catch (IOException ex)
        {
            Trace.Warn("MHF2", "Sending the postback body failed.", ex);
            return "";
        }

        return GetResponse(url, ref request);
    }

    /// <summary>
    /// Executes <paramref name="request"/> and returns the response body as text.
    /// </summary>
    /// <param name="url">Address being requested (used for diagnostics only).</param>
    /// <param name="request">The prepared request to execute.</param>
    /// <returns>The response body, or an empty string when the request fails.</returns>
    private string GetResponse(string url, ref HttpWebRequest request)
    {
        try
        {
            // The 'using' blocks dispose only what was actually created, so a
            // failed GetResponse() no longer causes a NullReferenceException
            // during cleanup.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(resStream, ResolveEncoding(response)))
            {
                return reader.ReadToEnd();
            }
        }
        catch (WebException ex)
        {
            Trace.Warn("MHF2", "Request to " + url + " failed.", ex);
            return "";
        }
        catch (IOException ex)
        {
            Trace.Warn("MHF2", "Reading the response from " + url + " failed.", ex);
            return "";
        }
    }

    /// <summary>
    /// Picks the text encoding named by the response's character set, falling
    /// back to UTF-8 when none is given or the name is not recognised.
    /// </summary>
    private static Encoding ResolveEncoding(HttpWebResponse response)
    {
        string charset = response.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: fall through to the UTF-8 default.
            }
        }

        return Encoding.UTF8;
    }

    /// <summary>
    /// Builds the four postback values from the downloaded page.
    /// </summary>
    /// <param name="html">HTML of the page returned by the initial GET.</param>
    /// <returns>
    /// A four-element array of URL-encoded values: [0] event target,
    /// [1] event argument (empty), [2] view state, [3] event validation.
    /// Elements 2 and 3 are empty strings when the field is not in the page.
    /// </returns>
    private string[] ParseHTML(string html)
    {
        string[] data = new string[4];

        //Set the EVENTTARGET control
        data[0] = EncodedEventTarget;

        //Set the EVENTARGUMENT, should be an empty string
        data[1] = "";

        //get the ViewState data
        string viewState = ExtractHiddenField(html, "__VIEWSTATE");
        txtViewState.Text = viewState;
        data[2] = HttpUtility.UrlEncode(viewState);

        //get the EVENTVALIDATION data
        string eventValidation = ExtractHiddenField(html, "__EVENTVALIDATION");
        txtEventValidation.Text = eventValidation;
        data[3] = HttpUtility.UrlEncode(eventValidation);

        return data;
    }

    /// <summary>
    /// Returns the value attribute of the input element whose id is
    /// <paramref name="fieldId"/>, or an empty string when no such element
    /// (with the id attribute preceding the value attribute) is found.
    /// </summary>
    private static string ExtractHiddenField(string html, string fieldId)
    {
        if (string.IsNullOrEmpty(html))
        {
            return "";
        }

        // Capture only up to the closing quote of the value attribute, and
        // never past the end of the tag, instead of cutting fixed-length
        // prefixes and suffixes off the whole match.
        string pattern = "id=\"" + Regex.Escape(fieldId) + "\"[^>]*?\\svalue=\"([^\"]*)\"";
        Match match = Regex.Match(html, pattern);
        if (!match.Success)
        {
            return "";
        }

        // Attribute values are HTML-encoded in the markup.
        return HttpUtility.HtmlDecode(match.Groups[1].Value);
    }
}

Open in new window


And the front end is


<%@ Page Language="C#" AutoEventWireup="true" CodeFile="MHF2.aspx.cs" Inherits="MHF2" %>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title></title>
</head>
<body>
    <form id="form1" runat="server">
    <div>
        <asp:TextBox ID="txtEventTarget" runat="server"></asp:TextBox><br />
        <asp:TextBox ID="txtEventArgument" runat="server"></asp:TextBox><br />
        <asp:TextBox ID="txtViewState" runat="server"></asp:TextBox><br />
        <asp:TextBox ID="txtResponse2" runat="server"></asp:TextBox><br />
        <asp:TextBox ID="txtEventValidation" runat="server"></asp:TextBox><br />
          <asp:TextBox ID="txtPostBack" runat="server"></asp:TextBox><br />
        <asp:Button ID="btnSubmit" runat="server" Text="Submit"
            onclick="btnSubmit_Click1" />

    </div>
    </form>
</body>
</html>
C#ASP.NETJavaScript

Avatar of undefined
Last Comment
Miguel Oz
ASKER CERTIFIED SOLUTION
Avatar of James Williams
James Williams
Flag of United States of America image

Blurred text
THIS SOLUTION IS ONLY AVAILABLE TO MEMBERS.
View this solution by signing up for a free trial.
Members can start a 7-Day free trial and enjoy unlimited access to the platform.
See Pricing Options
Start Free Trial
Avatar of mmalik15
mmalik15

ASKER

Thanks a lot for the comment, selvol. I really need to solve this problem.

I need to extract hyper-links from these pages. e.g.

http://www.ihi.org/knowledge/Pages/Measures/RegistrySize.aspx
http://www.ihi.org/knowledge/Pages/Measures/MeasuresRapidResponseTeams.aspx and so on

There are quite a lot of websites which have implemented ASP.NET GridViews with JavaScript paging, and I need links from those as well.

It's not a one-time thing; there are plenty of sites where I need this mechanism. I have not done any development in PHP, but if there is a straightforward solution in PHP then I don't mind learning and using it.

many thanks again.
Avatar of mmalik15
mmalik15

ASKER

Inner text would be fine as well. thanks
Avatar of Miguel Oz
Miguel Oz
Flag of Australia image

Check:
http://www.dotnetperls.com/scraping-html
The above link downloads the page as a string and then has an example of how to search for hyperlinks embedded in hyperlink tags, such as those in the "More on this topic" box.

for more examples check:
http://www.codersource.net/MicrosoftNet/CAdvanced/HTMLScreenScrapinginC.aspx
ASP.NET
ASP.NET

The successor to Active Server Pages, ASP.NET websites utilize the .NET framework to produce dynamic, data and content-driven web applications and services. ASP.NET code can be written using any .NET supported language. As of 2009, ASP.NET can also apply the Model-View-Controller (MVC) pattern to web applications

128K
Questions
--
Followers
--
Top Experts
Get a personalized solution from industry experts
Ask the experts
Read over 600 more reviews

TRUSTED BY

IBM logoIntel logoMicrosoft logoUbisoft logoSAP logo
Qualcomm logoCitrix Systems logoWorkday logoErnst & Young logo
High performer badgeUsers love us badge
LinkedIn logoFacebook logoX logoInstagram logoTikTok logoYouTube logo