http数据抓取Java实现的几种方式

前言:

  时下互联网第一波的浪潮已消逝,随之而来的是基于万千数据的物联网时代,数据因而成为企业的重要战略资源之一。基于数据抓取技术,本文介绍了Java相关的抓取工具,并附上demo源码供感兴趣的朋友测试!

1)JDK自带HTTP连接,获取页面或Json

2) JDK自带URL连接,获取页面或Json

3)HttpClient Get工具,获取页面或Json

4)commons-io工具,获取页面或Json

5) Jsoup工具(通常用于html字段解析),获取页面(不适用于Json返回格式)

完整代码:

package com.yeezhao.common.http;import java.io.BufferedReader;import java.io.InputStream;import java.io.InputStreamReader;import java.net.HttpURLConnection;import java.net.URL;import org.apache.commons.httpclient.HttpClient;import org.apache.commons.httpclient.HttpMethod;import org.apache.commons.httpclient.methods.GetMethod;import org.apache.commons.io.IOUtils;import org.jsoup.Jsoup;/** * http工具对比 *  * @author Administrator -> junhong * *         2016年12月27日 */public class HttpFetchUtil {        /**     * 获取访问的状态码     * @param request     * @return     * @throws Exception     */    public static int getResponseCode(String request) throws Exception {        URL url = new URL(request);        HttpURLConnection conn = (HttpURLConnection) url.openConnection();        return conn.getResponseCode();    }    /**     * 1)JDK自带HTTP连接,获取页面或Json     * @param request     * @param charset     * @return     * @throws Exception     */    public static String JDKFetch(String request, String charset) throws Exception {        URL url = new URL(request);        HttpURLConnection conn = (HttpURLConnection) url.openConnection();        //模拟浏览器参数        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"                + " (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");        if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {            InputStream input = conn.getInputStream();            StringBuffer sb = new StringBuffer();            BufferedReader reader = new BufferedReader(new InputStreamReader(input, charset));            String s;            while ((s = reader.readLine()) != null) {                sb.append(s + "\n");            }            input.close();            conn.disconnect();            return sb.toString();        }        return "";    }    /**     * 2) JDK自带URL连接,获取页面或Json     * @param request     * @param charset     * @return     * @throws Exception     */    public static String URLFetch(String 
request, String charset) throws Exception {        URL url = new URL(request);        return IOUtils.toString(url.openStream());    }    /**     * 3)HttpClient Get工具,获取页面或Json     * @param url     * @param charset     * @return     * @throws Exception     */    public static String httpClientFetch(String url, String charset) throws Exception {        // GET        HttpClient httpClient = new HttpClient();        httpClient.getParams().setContentCharset(charset);        HttpMethod method = new GetMethod(url);        httpClient.executeMethod(method);        return method.getResponseBodyAsString();    }    /**     * 4)commons-io工具,获取页面或Json     * @param url     * @param charset     * @return     * @throws Exception     */    public static String commonsIOFetch(String url, String charset) throws Exception {        return IOUtils.toString(new URL(url), charset);    }        /**     * 5) Jsoup工具(通常用于html字段解析),获取页面,非Json返回格式     * @param url     * @return     * @throws Exception     */    public static String jsoupFetch(String url) throws Exception {        return Jsoup.parse(new URL(url), 2 * 1000).html();    }}

测试代码:

package com.yeezhao.common.http;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * Smoke tests for {@link HttpFetchUtil}: each test runs one fetch strategy against every
 * seed URL and prints what came back.
 *
 * <p>Three seed links:
 * <ol>
 *   <li>an encyclopedia web page</li>
 *   <li>API data fetched by simulating a browser</li>
 *   <li>plain API data</li>
 * </ol>
 *
 * @author Administrator -> junhong
 *
 * 2016-12-27
 */
public class HttpFetchUtilTest {

    /** URLs requested by every test. */
    String[] seeds = {
        "http://baike.baidu.com/view/1.htm",
        "http://m.ximalaya.com/tracks/26096131.json",
        "http://remyapi.yeezhao.com/api/query?wd=%E5%91%A8%E6%98%9F%E9%A9%B0%E7%9A%84%E7%94%B5%E5%BD%B1"
    };

    /** Charset name handed to the fetch methods that accept one. */
    final static String DEFAULT_CHARSET = "UTF-8";

    /** No fixture is required before a test. */
    @Before
    public void setUp() throws Exception {
    }

    /** Prints a marker after each test. */
    @After
    public void tearDown() throws Exception {
        System.out.println("--- down ---");
    }

    /** Prints the status code returned for each seed. */
    @Test
    public void testGetResponseCode() throws Exception {
        for (int i = 0; i < seeds.length; i++) {
            int status = HttpFetchUtil.getResponseCode(seeds[i]);
            System.out.println("ret=" + status);
        }
    }

    /** Prints the body fetched via the JDK HttpURLConnection strategy. */
    @Test
    public void testJDKFetch() throws Exception {
        for (int i = 0; i < seeds.length; i++) {
            String body = HttpFetchUtil.JDKFetch(seeds[i], DEFAULT_CHARSET);
            System.out.println("ret=" + body);
        }
    }

    /** Prints the body fetched via the JDK URL stream strategy. */
    @Test
    public void testURLFetch() throws Exception {
        for (int i = 0; i < seeds.length; i++) {
            String body = HttpFetchUtil.URLFetch(seeds[i], DEFAULT_CHARSET);
            System.out.println("ret=" + body);
        }
    }

    /** Prints the body fetched via Commons HttpClient. */
    @Test
    public void testHttpClientFetch() throws Exception {
        for (int i = 0; i < seeds.length; i++) {
            String body = HttpFetchUtil.httpClientFetch(seeds[i], DEFAULT_CHARSET);
            System.out.println("ret=" + body);
        }
    }

    /** Prints the body fetched via commons-io. */
    @Test
    public void testCommonsIOFetch() throws Exception {
        for (int i = 0; i < seeds.length; i++) {
            String body = HttpFetchUtil.commonsIOFetch(seeds[i], DEFAULT_CHARSET);
            System.out.println("ret=" + body);
        }
    }

    /** Prints the HTML fetched via Jsoup. */
    @Test
    public void testJsoupFetch() throws Exception {
        for (int i = 0; i < seeds.length; i++) {
            String html = HttpFetchUtil.jsoupFetch(seeds[i]);
            System.out.println("ret=" + html);
        }
    }
}

附:相关jar依赖

...
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.3</version>
</dependency>
<dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>
...

后语:  现在的数据时代,有着"数据即财富"的理念。因此,数据抓取技术将一直发展更新,基于此后续还将扩充针对POST方法的抓取方式,敬请期待!

以上就是http数据抓取Java实现的几种方式的详细内容,更多请关注其它相关文章!

一个人,一条路,人在途中,心随景动,

http数据抓取Java实现的几种方式

相关文章:

你感兴趣的文章:

标签云: