The job list is paginated, and the "Next" link on the page looks like this:

<span class="pagerlink">
  <a href="#" id="next" title="Go to the next page">Next</a>
</span>

With PhantomJS and Selenium, the script below reads the job table, clicks that link, and reads the table again:
import time
from selenium import webdriver
url = "https://manulife.taleo.net/careersection/external_global/jobsearch.ftl?lang=en&location=1038#"
driver = webdriver.PhantomJS()
driver.get(url)
time.sleep(2.0)
table = driver.find_element_by_id('jobs')
for e in table.find_elements_by_xpath("tbody/tr/th[1]"):
    print(e.text)
a = driver.find_element_by_id("next")
a.click()
time.sleep(1.0)
table = driver.find_element_by_id('jobs')
for e in table.find_elements_by_xpath("tbody/tr/th[1]"):
    print(e.text)
driver.quit()
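The script above clicks "next" exactly once. To walk every page the same way, one could keep clicking until the table stops changing. This is only a sketch; the stopping condition (comparing the first row of the table between clicks) is my own guess, not something the site documents:

# Keep clicking "next" and printing the first column until the table stops changing.
import time
from selenium import webdriver

url = "https://manulife.taleo.net/careersection/external_global/jobsearch.ftl?lang=en&location=1038#"
driver = webdriver.PhantomJS()
driver.get(url)
time.sleep(2.0)

previous_first_row = None
while True:
    table = driver.find_element_by_id('jobs')
    rows = [e.text for e in table.find_elements_by_xpath("tbody/tr/th[1]")]
    if not rows or rows[0] == previous_first_row:
        break  # the table did not change, so assume this was the last page
    for text in rows:
        print(text)
    previous_first_row = rows[0]
    driver.find_element_by_id("next").click()
    time.sleep(1.0)
driver.quit()

Fixed sleeps make this slow and brittle, though, which is a good reason to look at the AJAX requests instead.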
Use Firebug or the Chrome dev tools to see what AJAX requests the website makes, then recreate those requests from your script. Scraping AJAX-dependent websites is often easier than scraping "static" ones, because the AJAX requests return the data as well-structured JSON - the data is separated from the presentation. (source)

Watching the network tab while the job list loads shows a POST to a searchjobs REST endpoint. With the captured JSON payload saved to request.txt, the same request can be replayed with curl:
curl -o response.txt ^
--data-binary @request.txt ^
-H "Host: manulife.taleo.net" ^
-H "Accept: application/json, text/javascript, */*; q=0.01" ^
-H "Content-Type: application/json" ^
"https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072"
# -*- coding: utf-8 -*-
import json, math
import scrapy

URL = 'https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072'

REQUEST_TEMPLATE = {
    'advancedSearchFiltersSelectionParam': {'searchFilterSelections': [
        {'id': 'ORGANIZATION', 'selectedValues': []},
        {'id': 'LOCATION', 'selectedValues': []},
        {'id': 'JOB_FIELD', 'selectedValues': []},
        {'id': 'JOB_NUMBER', 'selectedValues': []},
        {'id': 'URGENT_JOB', 'selectedValues': []},
        {'id': 'EMPLOYEE_STATUS', 'selectedValues': []}]},
    'fieldData': {'fields': {'KEYWORD': '', 'LOCATION': '1038'}, 'valid': True},
    'filterSelectionParam': {'searchFilterSelections': [
        {'id': 'POSTING_DATE', 'selectedValues': []},
        {'id': 'LOCATION', 'selectedValues': []},
        {'id': 'JOB_FIELD', 'selectedValues': []},
        {'id': 'JOB_TYPE', 'selectedValues': []},
        {'id': 'JOB_SCHEDULE', 'selectedValues': []}]},
    'multilineEnabled': False,
    'pageNo': 0,
    'sortingSelection': {
        'ascendingSortingOrder': 'false',
        'sortBySelectionParam': '1'}}

HEADERS = {
    'Host': 'manulife.taleo.net',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
}

def getRequestObject(pageNo):
    o = REQUEST_TEMPLATE.copy()
    o['pageNo'] = pageNo
    return o

class MySpider(scrapy.Spider):
    name = 'jobsearch'
    allowed_domains = ['manulife.taleo.net']

    def start_requests(self):
        for pageNo in range(1, 8):  # how do we know to stop at page 7?
            o = getRequestObject(pageNo)
            yield scrapy.Request(URL, callback=self.parse, method='POST',
                                 headers=HEADERS, body=json.dumps(o))

    def parse(self, response):
        d = json.loads(response.body)
        pd = d['pagingData']
        currentPageNo = pd['currentPageNo']
        pageSize = pd['pageSize']
        totalCount = pd['totalCount']
        pageCount = int(math.ceil(1.0*totalCount/pageSize))  # here we can calculate page count!
        self.log('*** page %d of %d' % (currentPageNo, pageCount))
        for row in d['requisitionList']:
            self.log('\t'.join(row['column']))
At this point I still hadn't figured out how to get the number of pages from inside the start_requests method. I then figured it out from a short code sample and some guessing: issue only the first request from start_requests, compute the page count from its response, and yield the remaining page requests from that callback.
# -*- coding: utf-8 -*-
import json, math
import scrapy

URL = 'https://manulife.taleo.net/careersection/rest/jobboard/searchjobs?lang=en&portal=64140181072'

REQUEST_TEMPLATE = {
    'advancedSearchFiltersSelectionParam': {'searchFilterSelections': [
        {'id': 'ORGANIZATION', 'selectedValues': []},
        {'id': 'LOCATION', 'selectedValues': []},
        {'id': 'JOB_FIELD', 'selectedValues': []},
        {'id': 'JOB_NUMBER', 'selectedValues': []},
        {'id': 'URGENT_JOB', 'selectedValues': []},
        {'id': 'EMPLOYEE_STATUS', 'selectedValues': []}]},
    'fieldData': {'fields': {'KEYWORD': '', 'LOCATION': '1038'}, 'valid': True},
    'filterSelectionParam': {'searchFilterSelections': [
        {'id': 'POSTING_DATE', 'selectedValues': []},
        {'id': 'LOCATION', 'selectedValues': []},
        {'id': 'JOB_FIELD', 'selectedValues': []},
        {'id': 'JOB_TYPE', 'selectedValues': []},
        {'id': 'JOB_SCHEDULE', 'selectedValues': []}]},
    'multilineEnabled': False,
    'pageNo': 0,
    'sortingSelection': {
        'ascendingSortingOrder': 'false',
        'sortBySelectionParam': '1'}}

HEADERS = {
    'Host': 'manulife.taleo.net',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/json',
}

def getRequestObject(pageNo):
    o = REQUEST_TEMPLATE.copy()
    o['pageNo'] = pageNo
    return o

class MySpider(scrapy.Spider):
    name = 'jobsearch'
    pageCount = 0

    def showResults(self, d):
        pd = d['pagingData']
        currentPageNo = pd['currentPageNo']
        self.log('*** page %d of %d' % (currentPageNo, self.pageCount))
        for row in d['requisitionList']:
            self.log('\t'.join(row['column']))

    def getPageCount(self, response):
        d = json.loads(response.body)
        pd = d['pagingData']
        pageSize = pd['pageSize']
        totalCount = pd['totalCount']
        self.pageCount = int(math.ceil(1.0*totalCount/pageSize))  # here we can calculate page count!
        self.log('*** Pages: %d' % (self.pageCount,))
        self.showResults(d)
        for pageNo in range(2, self.pageCount+1):
            o = getRequestObject(pageNo)
            yield scrapy.Request(URL, callback=self.parse, method='POST',
                                 headers=HEADERS, body=json.dumps(o))

    def start_requests(self):
        o = getRequestObject(1)
        return [scrapy.Request(URL, callback=self.getPageCount, method='POST',
                               headers=HEADERS, body=json.dumps(o))]

    def parse(self, response):
        d = json.loads(response.body)
        self.showResults(d)
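Assuming the spider is saved as a standalone file such as jobsearch.py (the file name is my choice, nothing in the code requires it), it can be run without creating a full Scrapy project:

scrapy runspider jobsearch.py

The "*** page x of y" lines and the tab-separated job rows then show up in Scrapy's log output.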