java下载网页并读取内容 – Anonymous的博客

下载回来怎么也得读取内容:

package com.core.crawl;

import java.io.IOException;

import com.util.file.Files;

/**
 * Entry point: fetches a single web resource on a worker thread and
 * prints the elapsed wall-clock time in milliseconds.
 */
public class Crawl {

    /**
     * Downloads the w3c robots.txt via one {@link WebSpider} worker and
     * waits for it to finish.
     *
     * @param args unused
     * @throws IOException          declared to match the crawl API; not thrown here directly
     * @throws InterruptedException if waiting on the worker thread is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        final long startMillis = System.currentTimeMillis();

        WebSpider spider = new WebSpider();
        spider.setWebAddress("http://www.w3c.org/robots.txt");
        // Destination prefix; WebSpider appends the extension from Content-Type.
        spider.setDestFile(Files.getSysPath() + "/" + "robots.");

        Thread worker = new Thread(spider);
        worker.start();
        worker.join();

        System.out.println("the end");
        System.out.println(System.currentTimeMillis() - startMillis);
    }
}

package com.core.crawl;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;

import com.core.http.Http;

/**
 * Runnable that fetches one URL over HTTP GET, echoes the response body to
 * stdout line by line, and saves it to {@code destFile + <type>}, where the
 * type suffix is derived from the Content-Type header by {@link Http#fileType}.
 */
public class WebSpider implements Runnable {

    private Http http = new Http();
    private String webAddress = "";
    private String destFile = "";

    /** Sets the URL to fetch. */
    public void setWebAddress(String webAddress) {
        this.webAddress = webAddress;
    }

    /** Sets the destination file path prefix (extension appended at download time). */
    public void setDestFile(String destFile) {
        this.destFile = destFile;
    }

    /**
     * Downloads {@code webAddress}, printing each line and persisting it to
     * the destination file.
     *
     * @return {@code true} when the download completed, {@code false} on error
     *         (the original always returned {@code true}, masking failures)
     * @throws IOException          declared for API compatibility; failures are
     *                              currently reported to stdout instead
     * @throws InterruptedException declared for API compatibility
     */
    public boolean download() throws IOException, InterruptedException {
        HttpURLConnection httpConn = null;
        try {
            URL url = new URL(webAddress);
            httpConn = (HttpURLConnection) url.openConnection();
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14");

            String fileType = http.fileType(httpConn.getContentType());
            System.out.println(fileType);

            // try-with-resources closes both streams even on failure; the
            // original leaked the reader AND a FileOutputStream it never wrote
            // to, and a stray in.read() discarded the first response byte.
            // NOTE(review): platform default charset kept for compatibility —
            // ideally the charset should come from the Content-Type header.
            try (BufferedReader reader = new BufferedReader(
                         new InputStreamReader(httpConn.getInputStream()));
                 Writer out = new OutputStreamWriter(
                         new FileOutputStream(new File(destFile + fileType)))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                    // Actually persist the content, as the destFile API implies.
                    out.write(line);
                    out.write(System.lineSeparator());
                }
            }
            return true;
        } catch (Exception ex) {
            // Keep the original best-effort console reporting, but signal the
            // failure to the caller instead of unconditionally returning true.
            System.out.println(ex.toString());
            return false;
        } finally {
            // Guard against NPE: openConnection() may fail (e.g. malformed
            // URL) before httpConn is assigned.
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }
    }

    /** Thread entry point: logs the worker thread name and runs the download. */
    public void run() {
        try {
            System.out.println(Thread.currentThread().getName());
            download();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so owners of this thread can
            // observe the cancellation request.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}

package com.util.file;

/**
 * File-system path utilities.
 */
public class Files {

    /** Utility class — not instantiable. */
    private Files() {
    }

    /**
     * Returns the application's working directory (the JVM's {@code user.dir}
     * system property).
     *
     * @return the application root directory path
     */
    public static String getSysPath() {
        return System.getProperty("user.dir");
    }
}

results:

Thread-0html

# robots.txt for http://www.w3.org/## $Id: robots.txt,v 1.50 2007/12/13 17:09:37 ted Exp $#

# For use by search.w3.orgUser-agent: W3C-gsaDisallow: /Out-Of-Date

User-agent: W3T_SEDisallow: /Out-Of-Date

User-agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT; MS Search 4.0 Robot)Disallow: /

# W3C Link checkerUser-agent: W3C-checklinkDisallow:

# exclude some access-controlled areasUser-agent: *Disallow: /2004/ontaria/basicDisallow: /TeamDisallow: /ProjectDisallow: /WebDisallow: /SystemsDisallow: /HistoryDisallow: /Out-Of-DateDisallow: /2002/02/midDisallow: /mid/Disallow: /2004/08/W3CTalksDisallow: /2007/11/Talks/searchDisallow: /People/all/Disallow: /RDF/Validator/ARPServletDisallow: /2003/03/Translations/byLanguageDisallow: /2003/03/Translations/byTechnologyDisallow: /2005/11/Translations/QueryDisallow: /2003/glossary/subglossary/#Disallow: /2005/06/blog/#Disallow: /2001/07/pubrules-checker#shouldnt get transparent proxies but will ml links of things like pubrulesDisallow: /2000/06/webdata/xsltDisallow: /2000/09/webdata/xsltDisallow: /2005/08/online_xslt/xsltDisallow: /Bugs/Disallow: /Search/Mail/Public/Disallow: /2006/02/chartergenthe end10485

spider1.setWebAddress("http://www.w3c.org/");spider1.setDestFile(Files.getSysPath() + "/"+"w3c."); 这样的设置可以自行测试

别小看任何人,越不起眼的人,往往会做些让人想不到的事。

java下载网页并读取内容 – Anonymous的博客

相关文章:

你感兴趣的文章:

标签云: