Web Crawler (SearchCrawler) Source Code

SearchCrawler is a small Swing application that crawls the web from a user-supplied start URL, honors each site's robots.txt disallow rules, and records every page whose contents contain a given search string, both in an on-screen matches table and in a log file. The complete source follows.

import java.awt.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import javax.swing.*;
import javax.swing.table.*;

// A web crawler that searches the pages it visits for a given string.
public class SearchCrawler extends JFrame
{
  // Preset values for the Max URLs drop-down.
  private static final String[] MAX_URLS = {"50", "100", "500", "1000"};

  // Cache of robots.txt disallow lists, keyed by host.
  private HashMap disallowListCache = new HashMap();

  // Search GUI controls.
  private JTextField startTextField;
  private JComboBox maxComboBox;
  private JCheckBox limitCheckBox;
  private JTextField logTextField;
  private JTextField searchTextField;
  private JCheckBox caseCheckBox;
  private JButton searchButton;

  // Search status GUI controls.
  private JLabel crawlingLabel2;
  private JLabel crawledLabel2;
  private JLabel toCrawlLabel2;
  private JProgressBar progressBar;
  private JLabel matchesLabel2;

  // Table listing the search matches.
  private JTable table;

  // Flag indicating whether crawling is under way.
  private boolean crawling;

  // Writer for the matches log file.
  private PrintWriter logFileWriter;

  // Constructor for the Search Crawler.
  public SearchCrawler()
  {
    // Set the application title.
    setTitle("Search Crawler");

    // Set the window size.
    setSize(600, 600);

    // Handle window closing events.
    addWindowListener(new WindowAdapter() {
      public void windowClosing(WindowEvent e) {
        actionExit();
      }
    });

    // Set up the File menu.
    JMenuBar menuBar = new JMenuBar();
    JMenu fileMenu = new JMenu("File");
    fileMenu.setMnemonic(KeyEvent.VK_F);
    JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
    fileExitMenuItem.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionExit();
      }
    });
    fileMenu.add(fileExitMenuItem);
    menuBar.add(fileMenu);
    setJMenuBar(menuBar);

    // Set up the search panel.
    JPanel searchPanel = new JPanel();
    GridBagConstraints constraints;
    GridBagLayout layout = new GridBagLayout();
    searchPanel.setLayout(layout);

    JLabel startLabel = new JLabel("Start URL:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(startLabel, constraints);
    searchPanel.add(startLabel);

    startTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(startTextField, constraints);
    searchPanel.add(startTextField);

    JLabel maxLabel = new JLabel("Max URLs to Crawl (blank = unlimited):");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(maxLabel, constraints);
    searchPanel.add(maxLabel);

    maxComboBox = new JComboBox(MAX_URLS);
    maxComboBox.setEditable(true);
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(maxComboBox, constraints);
    searchPanel.add(maxComboBox);

    limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.WEST;
    constraints.insets = new Insets(0, 10, 0, 0);
    layout.setConstraints(limitCheckBox, constraints);
    searchPanel.add(limitCheckBox);

    JLabel blankLabel = new JLabel();
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    layout.setConstraints(blankLabel, constraints);
    searchPanel.add(blankLabel);

    JLabel logLabel = new JLabel("Matches Log File:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(logLabel, constraints);
    searchPanel.add(logLabel);

    String file = System.getProperty("user.dir") +
      System.getProperty("file.separator") + "crawler.log";
    logTextField = new JTextField(file);
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(logTextField, constraints);
    searchPanel.add(logTextField);

    JLabel searchLabel = new JLabel("Search String:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(searchLabel, constraints);
    searchPanel.add(searchLabel);

    searchTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.insets = new Insets(5, 5, 0, 0);
    constraints.gridwidth = 2;
    constraints.weightx = 1.0d;
    layout.setConstraints(searchTextField, constraints);
    searchPanel.add(searchTextField);

    caseCheckBox = new JCheckBox("Case Sensitive");
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 5);
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    layout.setConstraints(caseCheckBox, constraints);
    searchPanel.add(caseCheckBox);

    searchButton = new JButton("Search");
    searchButton.addActionListener(new ActionListener() {
      public void actionPerformed(ActionEvent e) {
        actionSearch();
      }
    });
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    layout.setConstraints(searchButton, constraints);
    searchPanel.add(searchButton);

    JSeparator separator = new JSeparator();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    layout.setConstraints(separator, constraints);
    searchPanel.add(separator);

    JLabel crawlingLabel1 = new JLabel("Crawling:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(crawlingLabel1, constraints);
    searchPanel.add(crawlingLabel1);

    crawlingLabel2 = new JLabel();
    crawlingLabel2.setFont(crawlingLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(crawlingLabel2, constraints);
    searchPanel.add(crawlingLabel2);

    JLabel crawledLabel1 = new JLabel("Crawled URLs:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(crawledLabel1, constraints);
    searchPanel.add(crawledLabel1);

    crawledLabel2 = new JLabel();
    crawledLabel2.setFont(crawledLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(crawledLabel2, constraints);
    searchPanel.add(crawledLabel2);

    JLabel toCrawlLabel1 = new JLabel("URLs to Crawl:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(toCrawlLabel1, constraints);
    searchPanel.add(toCrawlLabel1);

    toCrawlLabel2 = new JLabel();
    toCrawlLabel2.setFont(toCrawlLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(toCrawlLabel2, constraints);
    searchPanel.add(toCrawlLabel2);

    JLabel progressLabel = new JLabel("Crawling Progress:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(progressLabel, constraints);
    searchPanel.add(progressLabel);

    progressBar = new JProgressBar();
    progressBar.setMinimum(0);
    progressBar.setStringPainted(true);
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(progressBar, constraints);
    searchPanel.add(progressBar);

    JLabel matchesLabel1 = new JLabel("Search Matches:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 10, 0);
    layout.setConstraints(matchesLabel1, constraints);
    searchPanel.add(matchesLabel1);

    matchesLabel2 = new JLabel();
    matchesLabel2.setFont(matchesLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 10, 5);
    layout.setConstraints(matchesLabel2, constraints);
    searchPanel.add(matchesLabel2);

    // Set up the matches table.
    table = new JTable(new DefaultTableModel(new Object[][]{}, new String[]{"URL"}) {
      public boolean isCellEditable(int row, int column) {
        return false;
      }
    });

    // Set up the Matches panel.
    JPanel matchesPanel = new JPanel();
    matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
    matchesPanel.setLayout(new BorderLayout());
    matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);

    // Add the panels to the frame.
    getContentPane().setLayout(new BorderLayout());
    getContentPane().add(searchPanel, BorderLayout.NORTH);
    getContentPane().add(matchesPanel, BorderLayout.CENTER);
  }

  // Handle clicks on the Search/Stop button.
  private void actionSearch() {
    // If the Stop button was clicked, turn the crawling flag off.
    if (crawling) {
      crawling = false;
      return;
    }

    ArrayList errorList = new ArrayList();

    // Validate that a start URL was entered.
    String startUrl = startTextField.getText().trim();
    if (startUrl.length() < 1) {
      errorList.add("Missing Start URL.");
    }
    // Verify the start URL.
    else if (verifyUrl(startUrl) == null) {
      errorList.add("Invalid Start URL.");
    }

    // Validate that the Max URLs value is either blank or a positive number.
    // A blank value (-1) means the crawl is unlimited.
    int maxUrls = -1;
    String max = ((String) maxComboBox.getSelectedItem()).trim();
    if (max.length() > 0) {
      try {
        maxUrls = Integer.parseInt(max);
      } catch (NumberFormatException e) {
      }
      if (maxUrls < 1) {
        errorList.add("Invalid Max URLs value.");
      }
    }

    // Validate that a matches log file was entered.
    String logFile = logTextField.getText().trim();
    if (logFile.length() < 1) {
      errorList.add("Missing Matches Log File.");
    }

    // Validate that a search string was entered.
    String searchString = searchTextField.getText().trim();
    if (searchString.length() < 1) {
      errorList.add("Missing Search String.");
    }

    // If there were errors, show them and return.
    if (errorList.size() > 0) {
      StringBuffer message = new StringBuffer();
      // Concatenate the errors into a single message.
      for (int i = 0; i < errorList.size(); i++) {
        message.append(errorList.get(i));
        if (i + 1 < errorList.size()) {
          message.append("\n");
        }
      }
      showError(message.toString());
      return;
    }

    // Remove "www" from the start URL if present.
    startUrl = removeWwwFromUrl(startUrl);

    // Start the Search Crawler.
    search(logFile, startUrl, maxUrls, searchString);
  }

  private void search(final String logFile, final String startUrl,
    final int maxUrls, final String searchString)
  {
    // Run the search on a new thread so the GUI stays responsive.
    Thread thread = new Thread(new Runnable() {
      public void run() {
        // Show the wait cursor while the search is under way.
        setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

        // Disable the search controls.
        startTextField.setEnabled(false);
        maxComboBox.setEnabled(false);
        limitCheckBox.setEnabled(false);
        logTextField.setEnabled(false);
        searchTextField.setEnabled(false);
        caseCheckBox.setEnabled(false);

        // Switch the Search button to "Stop".
        searchButton.setText("Stop");

        // Reset the status displays.
        table.setModel(new DefaultTableModel(new Object[][]{}, new String[]{"URL"}) {
          public boolean isCellEditable(int row, int column) {
            return false;
          }
        });
        updateStats(startUrl, 0, 0, maxUrls);

        // Open the matches log file.
        try {
          logFileWriter = new PrintWriter(new FileWriter(logFile));
        } catch (Exception e) {
          showError("Unable to open matches log file.");
          return;
        }

        // Turn the crawling flag on.
        crawling = true;

        // Perform the actual crawl.
        crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
          searchString, caseCheckBox.isSelected());

        // Turn the crawling flag off.
        crawling = false;

        // Close the matches log file.
        try {
          logFileWriter.close();
        } catch (Exception e) {
          showError("Unable to close matches log file.");
        }

        // Mark the search as done.
        crawlingLabel2.setText("Done");

        // Re-enable the search controls.
        startTextField.setEnabled(true);
        maxComboBox.setEnabled(true);
        limitCheckBox.setEnabled(true);
        logTextField.setEnabled(true);
        searchTextField.setEnabled(true);
        caseCheckBox.setEnabled(true);

        // Switch the Search button back to "Search".
        searchButton.setText("Search");

        // Return to the default cursor.
        setCursor(Cursor.getDefaultCursor());

        // Show a message if the search string was not found.
        if (table.getRowCount() == 0) {
          JOptionPane.showMessageDialog(SearchCrawler.this,
            "Your Search String was not found. Please try another.",
            "Search String Not Found",
            JOptionPane.WARNING_MESSAGE);
        }
      }
    });
    thread.start();
  }

  // Exit the program.
  private void actionExit() {
    System.exit(0);
  }

  // Verify the format of a URL.
  private URL verifyUrl(String url) {
    // Only allow HTTP URLs.
    if (!url.toLowerCase().startsWith("http://")) {
      return null;
    }

    // Verify the URL's format.
    URL verifiedUrl = null;
    try {
      verifiedUrl = new URL(url);
    } catch (Exception e) {
      return null;
    }

    return verifiedUrl;
  }

  // Add a match to the matches table and the log file.
  private void addMatch(String url) {
    // Add the URL to the matches table.
    DefaultTableModel model = (DefaultTableModel) table.getModel();
    model.addRow(new Object[]{url});

    // Add the URL to the log file.
    try {
      logFileWriter.println(url);
    } catch (Exception e) {
      showError("Unable to log match.");
    }
  }

  // Update the crawling status displays.
  private void updateStats(String crawling, int crawled, int toCrawl, int maxUrls) {
    crawlingLabel2.setText(crawling);
    crawledLabel2.setText("" + crawled);
    toCrawlLabel2.setText("" + toCrawl);

    // Update the progress bar.
    if (maxUrls == -1) {
      progressBar.setMaximum(crawled + toCrawl);
    } else {
      progressBar.setMaximum(maxUrls);
    }
    progressBar.setValue(crawled);

    matchesLabel2.setText("" + table.getRowCount());
  }

  // Check whether robots are allowed to access the given URL.
  private boolean isRobotAllowed(URL urlToCheck) {
    String host = urlToCheck.getHost().toLowerCase();

    // Retrieve the host's disallow list from the cache.
    ArrayList disallowList = (ArrayList) disallowListCache.get(host);

    // If the list is not in the cache, download it and cache it.
    if (disallowList == null) {
      disallowList = new ArrayList();

      try {
        URL robotsFileUrl = new URL("http://" + host + "/robots.txt");

        // Open a connection to the robots.txt file for reading.
        BufferedReader reader = new BufferedReader(
          new InputStreamReader(robotsFileUrl.openStream()));

        // Read the robots.txt file, building the list of disallowed paths.
        String line;
        while ((line = reader.readLine()) != null) {
          if (line.indexOf("Disallow:") == 0) {
            String disallowPath = line.substring("Disallow:".length());

            // Strip any trailing comment from the disallow path.
            int commentIndex = disallowPath.indexOf("#");
            if (commentIndex != -1) {
              disallowPath = disallowPath.substring(0, commentIndex);
            }

            // Remove leading and trailing whitespace from the disallow path.
            disallowPath = disallowPath.trim();

            // Add the disallow path to the list.
            disallowList.add(disallowPath);
          }
        }

        // Cache the disallow list for this host.
        disallowListCache.put(host, disallowList);
      } catch (Exception e) {
        // Assume all paths are allowed when no robots.txt file exists.
        return true;
      }
    }

    // Check whether the given URL starts with any disallowed path.
    String file = urlToCheck.getFile();
    for (int i = 0; i < disallowList.size(); i++) {
      String disallow = (String) disallowList.get(i);
      if (file.startsWith(disallow)) {
        return false;
      }
    }

    return true;
  }

  // Download the page at the given URL.
  private String downloadPage(URL pageUrl) {
    try {
      // Open a connection to the URL for reading.
      BufferedReader reader = new BufferedReader(
        new InputStreamReader(pageUrl.openStream()));

      // Read the page into a buffer.
      String line;
      StringBuffer pageBuffer = new StringBuffer();
      while ((line = reader.readLine()) != null) {
        pageBuffer.append(line);
      }

      return pageBuffer.toString();
    } catch (Exception e) {
    }

    return null;
  }

  // Remove the leading "www" from a URL's host, if present.
  private String removeWwwFromUrl(String url) {
    int index = url.indexOf("://www.");
    if (index != -1) {
      return url.substring(0, index + 3) + url.substring(index + 7);
    }
    return url;
  }

  // Parse the page contents and retrieve its links.
  private ArrayList retrieveLinks(URL pageUrl, String pageContents,
    HashSet crawledList, boolean limitHost)
  {
    // Compile the link-matching pattern.
    Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
      Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(pageContents);

    // Build a list of link matches.
    ArrayList linkList = new ArrayList();
    while (m.find()) {
      String link = m.group(1).trim();

      // Skip empty links.
      if (link.length() < 1) {
        continue;
      }

      // Skip links that are just page anchors.
      if (link.charAt(0) == '#') {
        continue;
      }

      // Skip mailto links.
      if (link.indexOf("mailto:") != -1) {
        continue;
      }

      // Skip JavaScript links.
      if (link.toLowerCase().indexOf("javascript") != -1) {
        continue;
      }

      // Prefix absolute and relative URLs if necessary.
      if (link.indexOf("://") == -1) {
        // Handle absolute URLs.
        if (link.charAt(0) == '/') {
          link = "http://" + pageUrl.getHost() + link;
        // Handle relative URLs.
        } else {
          String file = pageUrl.getFile();
          if (file.indexOf('/') == -1) {
            link = "http://" + pageUrl.getHost() + "/" + link;
          } else {
            String path = file.substring(0, file.lastIndexOf('/') + 1);
            link = "http://" + pageUrl.getHost() + path + link;
          }
        }
      }

      // Remove any anchor from the link.
      int index = link.indexOf('#');
      if (index != -1) {
        link = link.substring(0, index);
      }

      // Remove the leading "www" if present.
      link = removeWwwFromUrl(link);

      // Verify the link; skip it if it is invalid.
      URL verifiedLink = verifyUrl(link);
      if (verifiedLink == null) {
        continue;
      }

      // If limiting to the start host, skip links on other hosts.
      if (limitHost &&
          !pageUrl.getHost().toLowerCase().equals(
            verifiedLink.getHost().toLowerCase()))
      {
        continue;
      }

      // Skip the link if it has already been crawled.
      if (crawledList.contains(link)) {
        continue;
      }

      // Add the link to the list.
      linkList.add(link);
    }

    return linkList;
  }

  // Determine whether the search string is present in the given page contents.
  private boolean searchStringMatches(String pageContents,
    String searchString, boolean caseSensitive)
  {
    String searchContents = pageContents;

    // For a case-insensitive search, lowercase the page contents before comparing.
    if (!caseSensitive) {
      searchContents = pageContents.toLowerCase();
    }

    // Split the search string into individual terms.
    Pattern p = Pattern.compile("[\\s]+");
    String[] terms = p.split(searchString);

    // The page matches only if every term is present.
    for (int i = 0; i < terms.length; i++) {
      if (caseSensitive) {
        if (searchContents.indexOf(terms[i]) == -1) {
          return false;
        }
      } else {
        if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
          return false;
        }
      }
    }

    return true;
  }

  // Perform the actual crawl, searching each page for the search string.
  public void crawl(String startUrl, int maxUrls, boolean limitHost,
    String searchString, boolean caseSensitive)
  {
    // Set up the crawl lists.
    HashSet crawledList = new HashSet();
    LinkedHashSet toCrawlList = new LinkedHashSet();

    // Add the start URL to the to-crawl list.
    toCrawlList.add(startUrl);

    // Crawl by looping through the to-crawl list.
    while (crawling && toCrawlList.size() > 0) {
      // Check whether the maximum URL count, if specified, has been reached.
      if (maxUrls != -1) {
        if (crawledList.size() == maxUrls) {
          break;
        }
      }

      // Get the next URL from the to-crawl list (FIFO order).
      String url = (String) toCrawlList.iterator().next();

      // Remove the URL from the to-crawl list.
      toCrawlList.remove(url);

      // Convert the string URL into a URL object.
      URL verifiedUrl = verifyUrl(url);

      // Skip this URL if robots are not allowed to access it.
      if (!isRobotAllowed(verifiedUrl)) {
        continue;
      }

      // Update the crawling status.
      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);

      // Add the page to the crawled list.
      crawledList.add(url);

      // Download the page at the given URL.
      String pageContents = downloadPage(verifiedUrl);

      // If the page downloaded successfully, retrieve all of its links
      // and then check whether it contains the search string.
      if (pageContents != null && pageContents.length() > 0) {
        // Retrieve the list of valid links from the page.
        ArrayList links = retrieveLinks(verifiedUrl, pageContents,
          crawledList, limitHost);

        // Add the links to the to-crawl list.
        toCrawlList.addAll(links);

        // If the search string is present in the page, record a match.
        if (searchStringMatches(pageContents, searchString, caseSensitive)) {
          addMatch(url);
        }
      }

      // Update the crawling status.
      updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
    }
  }

  // Show a dialog with an error message.
  private void showError(String message) {
    JOptionPane.showMessageDialog(this, message, "Error",
      JOptionPane.ERROR_MESSAGE);
  }

  // Run the Search Crawler.
  public static void main(String[] args) {
    SearchCrawler crawler = new SearchCrawler();
    crawler.setVisible(true);
  }
}
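
The link extraction in retrieveLinks() hinges on the single regular expression compiled at the top of that method. As a quick illustration, the following standalone snippet (the class name LinkRegexDemo is mine, not part of the program) runs the same pattern over a small HTML fragment:

import java.util.regex.*;

public class LinkRegexDemo {
  public static void main(String[] args) {
    // The same pattern retrieveLinks() uses: captures the href value of <a>
    // tags, with or without surrounding double quotes, case-insensitively.
    Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
      Pattern.CASE_INSENSITIVE);
    String html = "<p><a href=\"/docs/index.html\">Docs</a>"
      + " <A HREF=about.html>About</A></p>";
    Matcher m = p.matcher(html);
    while (m.find()) {
      System.out.println(m.group(1).trim());
    }
    // Prints:
    //   /docs/index.html
    //   about.html
  }
}

The pattern is deliberately simple: it misses single-quoted href values and anchors where href is not the first attribute, which is acceptable for a teaching crawler but not for production HTML parsing.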

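isRobotAllowed() implements only a small slice of the robots exclusion standard: it collects every Disallow: line in http://host/robots.txt, regardless of which User-agent section the line belongs to, and then rejects any URL whose path begins with a disallowed prefix. This sketch (hypothetical class RobotsDemo, parsing an in-memory string rather than a live robots.txt) isolates that parsing rule:

import java.io.*;
import java.util.*;

public class RobotsDemo {
  public static void main(String[] args) throws IOException {
    // Same parsing rule as isRobotAllowed(): collect every "Disallow:" path,
    // stripping "#" comments and surrounding whitespace.
    String robotsTxt = "User-agent: *\n" +
      "Disallow: /private/ # keep out\n" +
      "Disallow: /tmp/\n";
    BufferedReader reader = new BufferedReader(new StringReader(robotsTxt));
    List disallowList = new ArrayList();
    String line;
    while ((line = reader.readLine()) != null) {
      if (line.indexOf("Disallow:") == 0) {
        String path = line.substring("Disallow:".length());
        int comment = path.indexOf('#');
        if (comment != -1) {
          path = path.substring(0, comment);
        }
        disallowList.add(path.trim());
      }
    }
    System.out.println(disallowList); // prints [/private/, /tmp/]
  }
}

A fuller implementation would honor User-agent grouping and Allow: directives; the crawler above intentionally ignores both.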

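To try the program, save the listing as SearchCrawler.java, compile it with javac SearchCrawler.java, and launch it with java SearchCrawler. main() builds the frame directly on the launching thread, as older Swing examples commonly did; current Swing guidance is to construct the UI on the event dispatch thread. A minimal alternative entry point (hypothetical class Launcher, assuming Java 8 or later for the lambda) would be:

import javax.swing.SwingUtilities;

public class Launcher {
  public static void main(String[] args) {
    // Construct and show the GUI on the event dispatch thread,
    // as recommended for Swing applications.
    SwingUtilities.invokeLater(() -> {
      SearchCrawler crawler = new SearchCrawler();
      crawler.setVisible(true);
    });
  }
}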
