ashish kapil

asked on

scrape link containing <a href="#"> using scrapy

I am scraping https://manulife.taleo.net/careersection/external_global/jobsearch.ftl?lang=en&location=1038# . When I inspect the Next button's link, I get:
<span class="pagerlink">
				<a href="#" id="next" title="Go to the next page">Next</a>
			</span>



Can you help me figure out how to perform pagination on this website?
Flabio Gates

The pagination is handled by JavaScript, so I would suggest you try Selenium. To get you started:
import time
from selenium import webdriver

url = "https://manulife.taleo.net/careersection/external_global/jobsearch.ftl?lang=en&location=1038#"

# headless browser; PhantomJS was current when this was written
driver = webdriver.PhantomJS()

driver.get(url)
time.sleep(2.0)  # give the JavaScript time to render the results table

# the first column of each row in the jobs table holds the job title
table = driver.find_element_by_id('jobs')
for e in table.find_elements_by_xpath("tbody/tr/th[1]"):
    print(e.text)

# the "Next" link is driven by JavaScript, so click it through the browser
a = driver.find_element_by_id("next")
a.click()
time.sleep(1.0)  # wait for the next page to render

# re-read the table after pagination
table = driver.find_element_by_id('jobs')
for e in table.find_elements_by_xpath("tbody/tr/th[1]"):
    print(e.text)

driver.quit()

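Note that PhantomJS has since been deprecated, so on a recent Selenium release the same flow would use headless Chrome instead. A minimal sketch, assuming Selenium 4's By-based locators and a local Chrome install:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

url = "https://manulife.taleo.net/careersection/external_global/jobsearch.ftl?lang=en&location=1038#"

# sketch only: assumes Selenium 4+ and Chrome available on the PATH
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

driver.get(url)
time.sleep(2.0)  # crude wait; WebDriverWait would be more robust

table = driver.find_element(By.ID, "jobs")
for e in table.find_elements(By.XPATH, "tbody/tr/th[1]"):
    print(e.text)

driver.find_element(By.ID, "next").click()  # same JavaScript "Next" link
time.sleep(1.0)

table = driver.find_element(By.ID, "jobs")
for e in table.find_elements(By.XPATH, "tbody/tr/th[1]"):
    print(e.text)

driver.quit()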

If you are stuck with Scrapy, here's one approach. Another answer suggests:
Use Firebug or Chrome dev tools to see what AJAX requests are created by the website, then recreate these AJAX requests from your script. Often scraping AJAX dependent websites is easier than "static" websites because these AJAX requests return the data in well structured JSON - the data is separated from the presentation. - source
Following that advice, I captured the traffic (request and response) to and from the site when the Next link is clicked, then replayed it with curl (Windows syntax; --data-binary @request.txt sends the captured JSON body):
curl -o response.txt ^
    --data-binary @request.txt ^
    -H "Host: manulife.taleo.net" ^
    -H "Accept: application/json, text/javascript, */*; q=0.01" ^
    -H "Content-Type: application/json" ^
    "https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072"



So from Scrapy, you need to POST a JSON string (attached as request.txt) to the URL https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072, and you'll get a JSON string back (attached as response.txt) containing, among other things, the data the page uses to populate the table.

At the end of request.txt is the "pageNo" value, which you can increment.
request.txt
response.txt
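If you want to sanity-check the endpoint outside of both curl and Scrapy, the same POST can be reproduced with the requests library. This is only a sketch: the payload below is abbreviated to the fields that matter here, and whether the server accepts the abbreviated body is an assumption on my part; if it rejects it, send the full JSON from request.txt instead.

import requests

URL = ('https://manulife.taleo.net/careersection/rest/jobboard/'
       'searchjobs?lang=en&portal=64140181072')

HEADERS = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
}

# abbreviated payload; see the attached request.txt for the complete structure
payload = {
    'fieldData': {'fields': {'KEYWORD': '', 'LOCATION': '1038'}, 'valid': True},
    'multilineEnabled': False,
    'pageNo': 1,  # increment this to page through the results
    'sortingSelection': {'ascendingSortingOrder': 'false',
                         'sortBySelectionParam': '1'},
}

r = requests.post(URL, json=payload, headers=HEADERS)
r.raise_for_status()
data = r.json()
print(data['pagingData'])  # currentPageNo, pageSize, totalCount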
Found that I need to learn scrapy too.
# -*- coding: utf-8 -*-
import json, math
import scrapy

URL = 'https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072'

REQUEST_TEMPLATE = {
    'advancedSearchFiltersSelectionParam': {'searchFilterSelections': [
        {'id': 'ORGANIZATION',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_NUMBER',
         'selectedValues': []},
        {'id': 'URGENT_JOB',
         'selectedValues': []},
        {'id': 'EMPLOYEE_STATUS',
         'selectedValues': []}]},
    'fieldData': {'fields': {'KEYWORD': '', 'LOCATION': '1038'}, 'valid': True},
    'filterSelectionParam': {'searchFilterSelections': [
        {'id': 'POSTING_DATE',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_TYPE',
         'selectedValues': []},
        {'id': 'JOB_SCHEDULE',
         'selectedValues': []}]},
    'multilineEnabled': False,
    'pageNo': 0,
    'sortingSelection': {
        'ascendingSortingOrder': 'false',
        'sortBySelectionParam': '1'}}

HEADERS = {
    'Host': 'manulife.taleo.net',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
    }

def getRequestObject(pageNo):
    # a shallow copy is enough here: only the top-level 'pageNo' key changes
    o = REQUEST_TEMPLATE.copy()
    o['pageNo'] = pageNo

    return o

class MySpider(scrapy.Spider):
    name = 'jobsearch'
    allowed_domains = ['manulife.taleo.net']

    def start_requests(self):
        for pageNo in range(1, 8): # how do we know to stop at page 7?
            o = getRequestObject(pageNo)
            yield scrapy.Request(URL, callback=self.parse, method='POST', headers=HEADERS, body=json.dumps(o))

    def parse(self, response):
        d = json.loads(response.body)
        pd = d['pagingData']
        currentPageNo = pd['currentPageNo']
        pageSize = pd['pageSize']
        totalCount = pd['totalCount']

        pageCount = int(math.ceil(1.0*totalCount/pageSize)) # here we can calculate page count!

        self.log('*** page %d of %d' % (currentPageNo, pageCount))
        for row in d['requisitionList']:
            self.log('\t'.join(row['column']))



Still haven't figured out how to get the number of pages in the start_requests method.
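For the arithmetic: the server reports pageSize and totalCount in pagingData, so pageCount = ceil(totalCount / pageSize). With hypothetical numbers, totalCount = 161 and pageSize = 25 gives ceil(6.44) = 7 pages, which would match the hard-coded range(1, 8) above. The trick is getting those numbers before start_requests queues anything.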
Figured it out based on short sample code and guessing!
# -*- coding: utf-8 -*-
import json, math
import scrapy

URL = 'https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072'

REQUEST_TEMPLATE = {
    'advancedSearchFiltersSelectionParam': {'searchFilterSelections': [
        {'id': 'ORGANIZATION',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_NUMBER',
         'selectedValues': []},
        {'id': 'URGENT_JOB',
         'selectedValues': []},
        {'id': 'EMPLOYEE_STATUS',
         'selectedValues': []}]},
    'fieldData': {'fields': {'KEYWORD': '', 'LOCATION': '1038'}, 'valid': True},
    'filterSelectionParam': {'searchFilterSelections': [
        {'id': 'POSTING_DATE',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_TYPE',
         'selectedValues': []},
        {'id': 'JOB_SCHEDULE',
         'selectedValues': []}]},
    'multilineEnabled': False,
    'pageNo': 0,
    'sortingSelection': {
        'ascendingSortingOrder': 'false',
        'sortBySelectionParam': '1'}}

HEADERS = {
    'Host': 'manulife.taleo.net',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
    }

def getRequestObject(pageNo):
    # a shallow copy is enough here: only the top-level 'pageNo' key changes
    o = REQUEST_TEMPLATE.copy()
    o['pageNo'] = pageNo

    return o

class MySpider(scrapy.Spider):
    name = 'jobsearch'
    pageCount = 0

    def showResults(self, d):
        pd = d['pagingData']
        currentPageNo = pd['currentPageNo']

        self.log('*** page %d of %d' % (currentPageNo, self.pageCount))
        for row in d['requisitionList']:
            self.log('\t'.join(row['column']))

    def getPageCount(self, response):
        d = json.loads(response.body)
        pd = d['pagingData']
        pageSize = pd['pageSize']
        totalCount = pd['totalCount']

        self.pageCount = int(math.ceil(1.0*totalCount/pageSize)) # here we can calculate page count!
        self.log('*** Pages: %d' % (self.pageCount,))

        self.showResults(d)

        # page 1 was already fetched above; queue requests for the remaining pages
        for pageNo in range(2, self.pageCount+1):
            o = getRequestObject(pageNo)
            yield scrapy.Request(URL, callback=self.parse, method='POST', headers=HEADERS, body=json.dumps(o))

    def start_requests(self):
        # fetch page 1 first; its response tells getPageCount how many pages exist
        o = getRequestObject(1)
        return [scrapy.Request(URL, callback=self.getPageCount, method='POST', headers=HEADERS, body=json.dumps(o))]

    def parse(self, response):
        d = json.loads(response.body)
        self.showResults(d)

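If you save this as jobsearch.py (the file name is just an assumption; runspider takes any path), you can run it with: scrapy runspider jobsearch.py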

ASKER CERTIFIED SOLUTION
Flabio Gates
