Batch-Submitting Crawled Pages to a Search Engine with the Producer-Consumer Pattern

1: Crawler: crawler4j

   Reference book: 自己动手写网络爬虫 (Write Your Own Web Crawler), by Luo Gang

2: Search server: Solr 4.10

3: Multithreaded processing

   Reference book: Java Concurrency in Practice

   Related JDK API: the blocking queue class BlockingQueue<E> (a minimal usage sketch follows below)
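
For reference, here is a minimal sketch of the BlockingQueue hand-off between a producer thread and a consumer thread. The class and element names are purely illustrative and not part of the project:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class BlockingQueueSketch {
    public static void main(String[] args) {
        final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();

        // Consumer thread: take() blocks until an element is available.
        new Thread(new Runnable() {
            public void run() {
                try {
                    while (true) {
                        String item = queue.take();
                        System.out.println("consumed: " + item);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        }).start();

        // Producer side: put() blocks only if a bounded queue is full.
        try {
            queue.put("page-1");
            queue.put("page-2");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}

Because take() blocks until data arrives, the crawler threads and the commit thread can coordinate without any explicit wait/notify code.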

Business scenario: crawl job-posting pages from several domestic recruitment sites. When the crawler threads have accumulated a certain number of pages, or a certain amount of time has passed, the pages are submitted to the Solr search server in a batch (to improve commit efficiency).

Part of the crawler code:

package crawler;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class Crawler4jTest {

    public static void main(String[] args) {
        try {
            String crawlStorageFolder = "./tmp";
            int numberOfCrawlers = 5;

            CrawlConfig config = new CrawlConfig();
            // Be polite: send at most one request per second (1000 ms between requests).
            config.setPolitenessDelay(1000);
            // Crawl depth, counted from the seed URL: if seed A is level 1, a link B found in A is level 2,
            // and a link C found in B is level 3.
            config.setMaxDepthOfCrawling(5);
            // Maximum number of pages to fetch; the default of -1 means unlimited.
            config.setMaxPagesToFetch(50);
            // If a proxy server is needed:
            // config.setProxyHost("proxyserver.example.com"); // proxy host
            // config.setProxyPort(8080);                      // proxy port
            // If the proxy requires authentication:
            // config.setProxyUsername(username);
            // config.setProxyPassword(password);
            /*
             * This flag makes the crawl resumable (it can be restarted after being interrupted).
             * Note: with resumable crawling enabled, you must manually delete the contents of the
             * root storage folder before starting a fresh crawl.
             */
            config.setResumableCrawling(false);
            config.setCrawlStorageFolder(crawlStorageFolder);

            PageFetcher pageFetcher = new PageFetcher(config);
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
            controller.getCrawlersLocalData();
            controller.addSeed("");

            // Start the consumer thread that batches documents and commits them to Solr.
            CommitConsumer consumer = new CommitConsumer();
            new Thread(consumer).start();

            // Start the crawler (producer) threads.
            controller.start(WomiCrawler.class, numberOfCrawlers);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

package crawler;

import java.util.regex.Pattern;

import org.apache.solr.common.SolrInputDocument;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class WomiCrawler extends WebCrawler {

    private final static Pattern FILTERS = Pattern.compile(
            ".*(\\.(css|js|bmp|gif|jpe?g"
            + "|png|tiff?|mid|mp2|mp3|mp4"
            + "|wav|avi|mov|mpeg|ram|m4v|pdf"
            + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    // URL prefix that crawled pages must start with.
    private final static String URL_PREFIX = "";

    /**
     * shouldVisit decides whether the given URL should be crawled (visited).
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches() && href.startsWith(URL_PREFIX);
    }

    /**
     * visit processes the page the URL points to; the Page parameter wraps all data of that web page.
     */
    @Override
    public void visit(Page page) {
        try {
            SolrInputDocument doc = new SolrInputDocument();
            int docid = page.getWebURL().getDocid();
            String url = page.getWebURL().getURL();
            String parentUrl = page.getWebURL().getParentUrl();
            String anchor = page.getWebURL().getAnchor();

            doc.addField("id", docid + "");
            doc.addField("url", url + "");
            doc.addField("host", url + "");
            doc.addField("title", anchor + "");
            doc.addField("author", anchor + "");

            System.out.println("Docid: " + docid);
            System.out.println("URL: " + url);
            System.out.println("Parent page: " + parentUrl);
            System.out.println("Anchor: " + anchor);

            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
                String text = htmlParseData.getText();
                doc.addField("content", text);
            }

            // Producer side: put the document on the shared blocking queue.
            Lock lock = Lock.getInstance();
            lock.lstDocument.add(doc);
            lock.num++; // note: num++ is not atomic, so the count is only approximate under concurrency
            System.out.println("Pages crawled: num == " + lock.num);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Part of the object lock (shared state) code:

package crawler;

import java.util.concurrent.LinkedBlockingQueue;

import org.apache.solr.common.SolrInputDocument;

public class Lock {

    // volatile is required for safe double-checked locking.
    private static volatile Lock lock;

    public static Lock getInstance() {
        if (lock == null) {
            synchronized (Lock.class) {
                if (lock == null) {
                    lock = new Lock();
                }
            }
        }
        return lock;
    }

    private Lock() {
    }

    // number of pages crawled
    public int num = 0;

    // number of commits
    public int commitNum = 0;

    // document queue shared by the producers (crawler threads) and the consumer
    public LinkedBlockingQueue<SolrInputDocument> lstDocument = new LinkedBlockingQueue<SolrInputDocument>();
}

Part of the consumer code:

package crawler;

import java.util.LinkedList;
import java.util.List;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.common.SolrInputDocument;

import search.solr.IndexerUtil;

public class CommitConsumer implements Runnable {

    private SolrServer server = IndexerUtil.getHttpSolrServer("crawl");

    private List<SolrInputDocument> list = new LinkedList<SolrInputDocument>();

    private int commit = 0;

    public void run() {
        try {
            SolrInputDocument doc = null;
            // take() blocks until a document is available.
            while ((doc = Lock.getInstance().lstDocument.take()) != null) {
                list.add(doc);
                if (list.size() == 5) {
                    commit++;
                    server.add(list);
                    server.commit();
                    list.clear();
                    System.out.println("Commits: " + commit);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
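
The consumer obtains its SolrServer from IndexerUtil.getHttpSolrServer, which the post does not show. A minimal sketch of what such a helper could look like with SolrJ 4.x, assuming a hypothetical Solr instance at http://localhost:8983/solr with a core named crawl:

package search.solr;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer;

public class IndexerUtil {

    // Hypothetical Solr base URL; adjust to your own host, port and core layout.
    private static final String BASE_URL = "http://localhost:8983/solr/";

    public static SolrServer getHttpSolrServer(String coreName) {
        return new HttpSolrServer(BASE_URL + coreName);
    }
}

Also note that CommitConsumer above only commits once the batch reaches 5 documents, while the scenario calls for committing either on a size threshold or after a certain amount of time. One way to add a time-based flush is to replace take() with poll(timeout). The sketch below is only an illustration of that idea; the class name, the 5-document batch size and the 10-second timeout are assumptions, not values from the post:

package crawler;

import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.common.SolrInputDocument;

import search.solr.IndexerUtil;

public class TimedCommitConsumer implements Runnable {

    private SolrServer server = IndexerUtil.getHttpSolrServer("crawl");
    private List<SolrInputDocument> list = new LinkedList<SolrInputDocument>();
    private int commit = 0;

    public void run() {
        try {
            while (true) {
                // Wait up to 10 seconds for the next document instead of blocking forever.
                SolrInputDocument doc = Lock.getInstance().lstDocument.poll(10, TimeUnit.SECONDS);
                if (doc != null) {
                    list.add(doc);
                }
                // Flush when the batch is big enough, or when the wait timed out with data pending.
                if (list.size() >= 5 || (doc == null && !list.isEmpty())) {
                    commit++;
                    server.add(list);
                    server.commit();
                    list.clear();
                    System.out.println("Commits: " + commit);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}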
