scrape link containing <a href="#"> using scrapy

ashish kapil
I am scraping https://manulife.taleo.net/careersection/external_global/jobsearch.ftl?lang=en&location=1038# . When I inspect the next button's link, I get:
<span class="pagerlink">
				<a href="#" id="next" title="Go to the next page">Next</a>
			</span>

Can you help me with this? How can I perform pagination on this website?

The pagination is handled by JavaScript, so I would suggest you try Selenium. To get you started:
import time
from selenium import webdriver

url = "https://manulife.taleo.net/careersection/external_global/jobsearch.ftl?lang=en&location=1038#"

driver = webdriver.PhantomJS()  # headless browser (see the note on deprecation below)

driver.get(url)
time.sleep(2.0)  # crude wait for the JavaScript-rendered results table

# print the first column (job title) of every row on page 1
table = driver.find_element_by_id('jobs')
for e in table.find_elements_by_xpath("tbody/tr/th[1]"):
    print(e.text)

# click the "Next" pager link, then wait for the table to refresh
a = driver.find_element_by_id("next")
a.click()
time.sleep(1.0)

# same extraction again, now for page 2
table = driver.find_element_by_id('jobs')
for e in table.find_elements_by_xpath("tbody/tr/th[1]"):
    print(e.text)

driver.quit()

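A side note: PhantomJS has since been deprecated and removed from newer Selenium releases, which also replaced the find_element_by_* helpers. Here is a sketch of the same idea with headless Chrome under the Selenium 4 API; it assumes chromedriver is installed and on your PATH:

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

url = "https://manulife.taleo.net/careersection/external_global/jobsearch.ftl?lang=en&location=1038#"

options = webdriver.ChromeOptions()
options.add_argument("--headless")          # run without a visible browser window
driver = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH

driver.get(url)
time.sleep(2.0)  # crude wait for the JavaScript-rendered table

# Selenium 4 replaces find_element_by_id with find_element(By.ID, ...)
table = driver.find_element(By.ID, "jobs")
for e in table.find_elements(By.XPATH, "tbody/tr/th[1]"):
    print(e.text)

driver.find_element(By.ID, "next").click()  # the "Next" pager link
time.sleep(1.0)

table = driver.find_element(By.ID, "jobs")
for e in table.find_elements(By.XPATH, "tbody/tr/th[1]"):
    print(e.text)

driver.quit()

The explicit time.sleep() calls are a blunt instrument; WebDriverWait with an expected condition is the more robust way to wait for the table to refresh.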

If you are stuck with Scrapy, here's one approach. Another approach, suggested elsewhere:
"Use Firebug or Chrome dev tools to see what AJAX requests are created by the website, then recreate these AJAX requests from your script. Often, scraping AJAX-dependent websites is easier than scraping 'static' websites, because these AJAX requests return the data as well-structured JSON - the data is separated from the presentation." - source

I captured the traffic (request/response) to and from the site when the Next link is clicked:
curl -o response.txt ^
    --data-binary @request.txt ^
    -H "Host: manulife.taleo.net" ^
    -H "Accept: application/json, text/javascript, */*; q=0.01" ^
    -H "Content-Type: application/json" ^
    "https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072"

So from Scrapy, you need to POST a JSON string (attached as request.txt) to the URL https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072, and you'll get a JSON string back (attached as response.txt) containing, among other things, the data the page uses to populate the table.

At the end of request.txt is the "pageNo" value, which you can increment.
Attachments: request.txt, response.txt
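
Before wiring this into a spider, you can replay the captured request from plain Python to confirm the contract. A minimal sketch using the requests library, assuming the captured payload is saved as request.txt next to the script (page number 2 is arbitrary, just to prove pagination works):

import json
import requests

URL = 'https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072'
HEADERS = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
    }

# load the captured JSON payload and bump the page number
with open('request.txt') as f:
    payload = json.load(f)
payload['pageNo'] = 2  # arbitrary page for this test

r = requests.post(URL, headers=HEADERS, data=json.dumps(payload))
d = r.json()
print(d['pagingData'])            # pageSize, totalCount, currentPageNo
for row in d['requisitionList']:
    print(row['column'])          # job title, location, ... per row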
I found that I needed to learn Scrapy too.
# -*- coding: utf-8 -*-
import json, math
import scrapy

URL = 'https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072'

REQUEST_TEMPLATE = {
    'advancedSearchFiltersSelectionParam': {'searchFilterSelections': [
        {'id': 'ORGANIZATION',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_NUMBER',
         'selectedValues': []},
        {'id': 'URGENT_JOB',
         'selectedValues': []},
        {'id': 'EMPLOYEE_STATUS',
         'selectedValues': []}]},
    'fieldData': {'fields': {'KEYWORD': '', 'LOCATION': '1038'}, 'valid': True},
    'filterSelectionParam': {'searchFilterSelections': [
        {'id': 'POSTING_DATE',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_TYPE',
         'selectedValues': []},
        {'id': 'JOB_SCHEDULE',
         'selectedValues': []}]},
    'multilineEnabled': False,
    'pageNo': 0,
    'sortingSelection': {
        'ascendingSortingOrder': 'false',
        'sortBySelectionParam': '1'}}

HEADERS = {
    'Host': 'manulife.taleo.net',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
    }

def getRequestObject(pageNo):
    # a shallow copy is enough here: only the top-level 'pageNo' key is modified
    o = REQUEST_TEMPLATE.copy()
    o['pageNo'] = pageNo

    return o

class MySpider(scrapy.Spider):
    name = 'jobsearch'
    allowed_domains = ['manulife.taleo.net']

    def start_requests(self):
        for pageNo in range(1, 8): # how do we know to stop at page 7?
            o = getRequestObject(pageNo)
            yield scrapy.Request(URL, callback=self.parse, method='POST', headers=HEADERS, body=json.dumps(o))

    def parse(self, response):
        d = json.loads(response.body)
        pd = d['pagingData']
        currentPageNo = pd['currentPageNo']
        pageSize = pd['pageSize']
        totalCount = pd['totalCount']

        pageCount = int(math.ceil(1.0*totalCount/pageSize)) # here we can calculate page count!

        self.log('*** page %d of %d' % (currentPageNo, pageCount))
        for row in d['requisitionList']:
            self.log('\t'.join(row['column']))


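For what it's worth, the page-count line in parse is standard ceiling division: with a hypothetical totalCount of 178 and pageSize of 25, ceil(178 / 25) = ceil(7.12) = 8 pages. The 1.0* factor forces float division so the code also rounds up correctly under Python 2.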

Still haven't figured out how to get the number of pages in the start_requests method.
Figured it out based on a short code sample and some guessing!
# -*- coding: utf-8 -*-
import json, math
import scrapy

URL = 'https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072'

REQUEST_TEMPLATE = {
    'advancedSearchFiltersSelectionParam': {'searchFilterSelections': [
        {'id': 'ORGANIZATION',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_NUMBER',
         'selectedValues': []},
        {'id': 'URGENT_JOB',
         'selectedValues': []},
        {'id': 'EMPLOYEE_STATUS',
         'selectedValues': []}]},
    'fieldData': {'fields': {'KEYWORD': '', 'LOCATION': '1038'}, 'valid': True},
    'filterSelectionParam': {'searchFilterSelections': [
        {'id': 'POSTING_DATE',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_TYPE',
         'selectedValues': []},
        {'id': 'JOB_SCHEDULE',
         'selectedValues': []}]},
    'multilineEnabled': False,
    'pageNo': 0,
    'sortingSelection': {
        'ascendingSortingOrder': 'false',
        'sortBySelectionParam': '1'}}

HEADERS = {
    'Host': 'manulife.taleo.net',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
    }

def getRequestObject(pageNo):
    o = REQUEST_TEMPLATE.copy()
    o['pageNo'] = pageNo

    return o

class MySpider(scrapy.Spider):
    name = 'jobsearch'
    pageCount = 0

    def showResults(self, d):
        pd = d['pagingData']
        currentPageNo = pd['currentPageNo']

        self.log('*** page %d of %d' % (currentPageNo, self.pageCount))
        for row in d['requisitionList']:
            self.log('\t'.join(row['column']))

    def getPageCount(self, response):
        d = json.loads(response.body)
        pd = d['pagingData']
        pageSize = pd['pageSize']
        totalCount = pd['totalCount']

        self.pageCount = int(math.ceil(1.0*totalCount/pageSize)) # here we can calculate page count!
        self.log('*** Pages: %d' % (self.pageCount,))

        self.showResults(d)

        for pageNo in range(2, self.pageCount+1):
            o = getRequestObject(pageNo)
            yield scrapy.Request(URL, callback=self.parse, method='POST', headers=HEADERS, body=json.dumps(o))

    def start_requests(self):
        o = getRequestObject(1)
        return [scrapy.Request(URL, callback=self.getPageCount, method='POST', headers=HEADERS, body=json.dumps(o))]

    def parse(self, response):
        d = json.loads(response.body)
        self.showResults(d)

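Since the spider is self-contained in one file, it can be run without a full Scrapy project (the file name is just an example):

scrapy runspider jobsearch.py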

Final iteration (just to store my sample Scrapy code somewhere online):
# -*- coding: utf-8 -*-
import json, math
import scrapy

URL = 'https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072'

REQUEST_TEMPLATE = {
    'advancedSearchFiltersSelectionParam': {'searchFilterSelections': [
        {'id': 'ORGANIZATION',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_NUMBER',
         'selectedValues': []},
        {'id': 'URGENT_JOB',
         'selectedValues': []},
        {'id': 'EMPLOYEE_STATUS',
         'selectedValues': []}]},
    'fieldData': {'fields': {'KEYWORD': '', 'LOCATION': '1038'}, 'valid': True},
    'filterSelectionParam': {'searchFilterSelections': [
        {'id': 'POSTING_DATE',
         'selectedValues': []},
        {'id': 'LOCATION',
         'selectedValues': []},
        {'id': 'JOB_FIELD',
         'selectedValues': []},
        {'id': 'JOB_TYPE',
         'selectedValues': []},
        {'id': 'JOB_SCHEDULE',
         'selectedValues': []}]},
    'multilineEnabled': False,
    'pageNo': 0,
    'sortingSelection': {
        'ascendingSortingOrder': 'false',
        'sortBySelectionParam': '1'}}

HEADERS = {
    'Host': 'manulife.taleo.net',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
    }

def getRequest(pageNo, callback):
    o = REQUEST_TEMPLATE.copy()
    o['pageNo'] = pageNo
    body = json.dumps(o)

    return scrapy.Request(URL, callback=callback, method='POST', headers=HEADERS, body=body)

def getPageCount(pd):
    pageSize = pd['pageSize']
    totalCount = pd['totalCount']

    return int(math.ceil(1.0*totalCount/pageSize))

def getRowObject(row, currentPageNo):
    return {
        'page': currentPageNo,
        'title': row['column'][0],
        'location': row['column'][1],
        }

class MySpider(scrapy.Spider):
    name = 'jobsearch2'

    def getPageCount(self, response):
        d = json.loads(response.body)
        pd = d['pagingData']

        pageCount = getPageCount(pd)  # the module-level helper, not this method
        self.log('*** Pages: %d' % (pageCount,))

        for row in d['requisitionList']:
            yield getRowObject(row, pd['currentPageNo'])

        for pageNo in range(2, pageCount+1):
            yield getRequest(pageNo, self.parse)

    def start_requests(self):
        yield getRequest(1, self.getPageCount)

    def parse(self, response):
        d = json.loads(response.body)
        pd = d['pagingData']
        
        for row in d['requisitionList']:
            yield getRowObject(row, pd['currentPageNo'])

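Because this version yields plain dicts as items, Scrapy's feed exports can write the results straight to a file; for example (jobs.csv is an arbitrary name, and .json or .jl extensions work too):

scrapy runspider jobsearch2.py -o jobs.csv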
