I am trying to scrape a site that has scripts which run after the page loads. jsdom doesn't seem to handle this, but I can get it working with Zombie.js (which uses jsdom under the hood). I would rather not use Zombie.js, because I don't need the testing-framework stuff.
This is the jsdom code:
const request = require('request'); const jsdom = require('jsdom'); const cheerio = require('cheerio'); const domain = 'https://www.google.co.uk' const usa = 'mozilla/5.0 (windows nt 6.3; wow64) applewebkit/537.36 (khtml, gecko) chrome/38.0.2125.111 safari/537.36'; module.exports = function(cb) { const url = `${domain}/#q=monkeys`; request({ uri: url, headers: { 'user-agent': usa } }, function (err, res, body) { if (err && res.statuscode !== 200) throw err; const window = jsdom.jsdom(body, { url: url, useragent: usa, features: { fetchexternalresources: ['script'], processexternalresources: ['script'] } }).defaultview; window.addeventlistener('load', () => { // console.log(window.document.body.innerhtml) // html has scripts. let $ = cheerio.load(window.document); let els = $('selector').map(function () { // other code }).get(); cb(url, els); window.close(); }); }); }
This Zombie.js code works:
const browser = require('zombie'); const cheerio = require('cheerio'); const domain = 'https://www.google.co.uk' const usa = 'mozilla/5.0 (windows nt 6.3; wow64) applewebkit/537.36 (khtml, gecko) chrome/38.0.2125.111 safari/537.36'; module.exports = function(cb) { const url = `${domain}/#q=monkeys`; const browser = new browser({useragent: usa}); browser.visit(url, () => { browser.wait(window => window.document.getelementbyid('rhs'), function() { let $ = cheerio.load(browser.html()); let els = $('selector').map(function () { // other code }).get(); cb(url, els) browser.window.close(); }); }); }
Comments
Post a Comment