动态网页爬取例子(WebCollector+selenium+phantomjs)

目标:动态网页爬取

说明:这里的动态网页指几种可能:1)需要用户交互,如常见的登录操作;2)网页通过JS / AJAX动态生成,如一个html里有<div id="test"></div>,通过JS生成<div id="test"><span>aaa</span></div>。

这里用了WebCollector 2进行爬虫,这个框架用起来也方便;不过要支持动态网页,关键还是要靠另外一个API——Selenium 2(可集成HtmlUnit和PhantomJS)。

1)需要登录后的爬取,如新浪微博

import java.util.Set;import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;import cn.edu.hfut.dmic.webcollector.model.Links;import cn.edu.hfut.dmic.webcollector.model.Page;import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl;import org.openqa.selenium.Cookie;import org.openqa.selenium.WebElement;import org.openqa.selenium.htmlunit.HtmlUnitDriver;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/* * 登录后爬取 * Refer: * https://github.com/CrawlScript/WebCollector/blob/master/README.zh-cn.md * Lib required: webcollector-2.07-bin, selenium-java-2.44.0 & its lib */public class WebCollector1 extends DeepCrawler {public WebCollector1(String crawlPath) {super(crawlPath);/*获取新浪微博的cookie,账号密码以明文形式传输,请使用小号*/try {String cookie=WebCollector1.WeiboCN.getSinaCookie("yourAccount", "yourPwd");HttpRequesterImpl myRequester=(HttpRequesterImpl) this.getHttpRequester();myRequester.setCookie(cookie);} catch (Exception e) {e.printStackTrace();}}@Overridepublic Links visitAndGetNextLinks(Page page) {/*抽取微博*/Elements weibos=page.getDoc().select("div.c");for(Element weibo:weibos){System.out.println(weibo.text());}/*如果要爬取评论,这里可以抽取评论页面的URL,返回*/return null;}public static void main(String[] args) {WebCollector1 crawler=new WebCollector1("/home/hu/data/weibo");crawler.setThreads(3);/*对某人微博前5页进行爬取*/for(int i=0;i<5;i++){crawler.addSeed("?vt=4&page="+i);}try {crawler.start(1);} catch (Exception e) {e.printStackTrace();}}public static class WeiboCN {/*** 获取新浪微博的cookie,这个方法针对weibo.cn有效,对weibo.com无效* weibo.cn以明文形式传输数据,请使用小号* @param username 新浪微博用户名* @param password 新浪微博密码* @return* @throws Exception*/public static String getSinaCookie(String username, String password) throws Exception{StringBuilder sb = new StringBuilder();HtmlUnitDriver driver = new HtmlUnitDriver();driver.setJavascriptEnabled(true);driver.get("");WebElement mobile = driver.findElementByCssSelector("input[name=mobile]");mobile.sendKeys(username);WebElement pass = 
driver.findElementByCssSelector("input[name^=password]");pass.sendKeys(password);WebElement rem = driver.findElementByCssSelector("input[name=remember]");rem.click();WebElement submit = driver.findElementByCssSelector("input[name=submit]");submit.click();Set<Cookie> cookieSet = driver.manage().getCookies();driver.close();for (Cookie cookie : cookieSet) {sb.append(cookie.getName()+"="+cookie.getValue()+";");}String result=sb.toString();if(result.contains("gsid_CTandWM")){return result;}else{throw new Exception("weibo login failed");}}}}

* 这里有个自定义路径/home/hu/data/weibo(WebCollector1 crawler=new WebCollector1("/home/hu/data/weibo");),是用来把爬取状态保存到嵌入式数据库Berkeley DB的目录。

* 总体上来自Webcollector 作者的sample。

2)JS动态生成HTML元素的爬取

import java.util.List;import org.openqa.selenium.By;import org.openqa.selenium.WebDriver;import org.openqa.selenium.WebElement;import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;import cn.edu.hfut.dmic.webcollector.model.Links;import cn.edu.hfut.dmic.webcollector.model.Page;/* * JS爬取 * Refer: */public class WebCollector3 extends DeepCrawler {public WebCollector3(String crawlPath) {super(crawlPath);// TODO Auto-generated constructor stub}@Overridepublic Links visitAndGetNextLinks(Page page) {/*HtmlUnitDriver可以抽取JS生成的数据*///HtmlUnitDriver driver=PageUtils.getDriver(page,BrowserVersion.CHROME);//String content = PageUtils.getPhantomJSDriver(page);WebDriver driver = PageUtils.getWebDriver(page);//List<WebElement> divInfos=driver.findElementsByCssSelector("#feed_content");List<WebElement> divInfos=driver.findElements(By.cssSelector("#feed_content span"));for(WebElement divInfo:divInfos){System.out.println("Text是:" + divInfo.getText());}return null;}public static void main(String[] args) {WebCollector3 crawler=new WebCollector3("/home/hu/data/wb");for(int page=1;page<=5;page++)//crawler.addSeed("?query="+URLEncoder.encode("编程")+"&page="+page);crawler.addSeed("?294064");try {crawler.start(1);} catch (Exception e) {e.printStackTrace();}}}PageUtils.java

希望有一天,自己也像他们一样,踩着单车上路,

动态网页爬取例子(WebCollector+selenium+phantomjs)

相关文章:

你感兴趣的文章:

标签云: