java抓取网站数据,存放到Excel中

今天因工作需要,做了一个在网站上抓取数据的任务。

步骤为:1、先弄清要抓取网站的URL规则;2、设置正则表达式抓取规则;3、将抓取结果存放到同一个Excel中

代码如下:import java.io.File;import java.io.FileOutputStream;import java.io.InputStream;import java.net.HttpURLConnection;import java.net.URL;import java.util.regex.Matcher;import java.util.regex.Pattern;import org.apache.poi.hssf.usermodel.HSSFRow;import org.apache.poi.hssf.usermodel.HSSFSheet;import org.apache.poi.hssf.usermodel.HSSFWorkbook;public class GetChemicalbookSiteCompanyUtil { public static void main(String[] args) { try{ String tUrl = ""; String strUrl = ""; HSSFWorkbook hssfworkbook = new HSSFWorkbook();HSSFSheet hssfsheet = hssfworkbook.createSheet("公司基本信息"); //公司网址索引a-zfor (int j = 36; j <= 61; j++) {//61tUrl = "";for (int i = 0; i < 700; i=i+100) {tUrl = strUrl+j+"_"+i+".htm";System.out.println(tUrl);String pageData = "";int len;URL url = new URL(tUrl);HttpURLConnection url_con = null;url_con = (HttpURLConnection) url.openConnection();url_con.setFollowRedirects(true);url_con.setInstanceFollowRedirects(false);url_con.setRequestMethod("GET");if(url_con.getResponseCode()==200){InputStream in = url_con.getInputStream();byte[] by = new byte[1024];while((len=in.read(by))!=-1){pageData +=new String(by,0,len);}in.close();url_con.disconnect();//设置抓取规则String regEx ="<tr><tdwidth=\&;230\&;>(.+?)</td>" +"<tdwidth=\&;230\&;>(.+?)</td>"+"<tdwidth=\&;210\&;>(.+?)</td>"+"<tdwidth=\&;200\&;>(.+?)</td>"+"<tdwidth=\&;80\&;>(.+?)</td></tr>";String companyName = null;String tel = null;String email = null;String website = null;String country = null;pageData=pageData.replaceAll("\\s|\\t|\\r", "");Matcher mat=Pattern.compile(regEx).matcher(pageData);while(mat.find()){companyName = mat.group(1).replaceAll("<a.*?>|</a>", "");tel = mat.group(2).replaceAll("<span.*?>|</span>", "");email = mat.group(3).replaceAll("<span.*?>|</span>", "");website = mat.group(4).replaceAll("<a.*?>|</a>", "");country = mat.group(5).replaceAll("<span.*?>|</span>", "");if(country.equals("中国")){HSSFRow hssfrow = 
hssfsheet.createRow(hssfsheet.getLastRowNum()+1);hssfrow.createCell((short)0).setCellValue(companyName);hssfrow.createCell((short)1).setCellValue(email);hssfrow.createCell((short)2).setCellValue(tel);hssfrow.createCell((short)3).setCellValue(website);hssfrow.createCell((short)4).setCellValue(country);url_con.disconnect();}}url_con.disconnect();}//将数据导入到excel中File fileName = new File("F:\\材料\\chemicalbook_DBinfo.xls");if(!fileName.exists()){fileName.mkdir();}FileOutputStream output = new FileOutputStream(fileName);hssfworkbook.write(output);output.close();} } }catch (Exception e) {e.printStackTrace();} }}

看着它或是汹涌或是平静,然而一直相随,不离不弃。

java抓取网站数据,存放到Excel中

相关文章:

你感兴趣的文章:

标签云: