lucene搜索之正则表达式查询RegExQuery和手机邮箱查询示例

今天快下班的时候收到了一位群友的问题,大意是:读取文本文件中的内容,找出其中的手机号和邮箱。为此我写了一个读取文档内容并进行正则查询的示例,用于判断文件中是否含有邮箱或者手机号。这既是对之前文本处理工具的一次梳理,同时也结合了 lucene 内部提供的正则匹配查询 RegexQuery。

废话不多说,直接上代码。这里先按文件类型对内容读取进行分类处理,分为 pdf、word、excel 和普通文本四类,不同类型的文件读取文本内容的方式不一样。

pdf 利用 pdfbox 读取内容,word 和 excel 利用 poi 读取内容,普通文本文档则利用 JDK 自带的 API 读取。

读取pdf、word、excel和普通文本文档内容(支持word excel 2007)package com.lucene.index.util;import java.io.File;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.IOException;import java.io.InputStream;import java.nio.charset.Charset;import java.nio.file.Files;import java.nio.file.Paths;import java.util.LinkedList;import java.util.List;import org.apache.pdfbox.PDFReader;import org.apache.pdfbox.pdmodel.PDDocument;import org.apache.pdfbox.util.PDFTextStripper;import org.apache.poi.EncryptedDocumentException;import org.apache.poi.POIXMLDocument;import org.apache.poi.POIXMLTextExtractor;import org.apache.poi.hssf.usermodel.HSSFCell;import org.apache.poi.hssf.usermodel.HSSFRow;import org.apache.poi.hwpf.extractor.WordExtractor;import org.apache.poi.openxml4j.exceptions.InvalidFormatException;import org.apache.poi.openxml4j.exceptions.OpenXML4JException;import org.apache.poi.openxml4j.opc.OPCPackage;import org.apache.poi.ss.usermodel.Sheet;import org.apache.poi.ss.usermodel.Workbook;import org.apache.poi.ss.usermodel.WorkbookFactory;import org.apache.poi.xssf.usermodel.XSSFRow;import org.apache.poi.xssf.usermodel.XSSFSheet;import org.apache.poi.xssf.usermodel.XSSFWorkbook;import org.apache.poi.xwpf.extractor.XWPFWordExtractor;import org.apache.xmlbeans.XmlException;import com.lucene.bean.FileBean;public class FileUtil {/**读取文件信息和下属文件夹 * @param folder * @return * @throws IOException * @throws OpenXML4JException * @throws XmlException */public static List<FileBean> getFolderFiles(String folder) throws Exception {List<FileBean> fileBeans = new LinkedList<FileBean>();File file = new File(folder);if(file.isDirectory()){File[] files = file.listFiles();if(files != null){for (File file2 : files) {fileBeans.addAll(getFolderFiles(file2.getAbsolutePath()));}}}else{FileBean bean = new FileBean();String filePath = file.getAbsolutePath();bean.setPath(file.getAbsolutePath());bean.setModified(file.lastModified());String content = "";if(filePath.endsWith(".doc") || 
filePath.endsWith(".docx")){content = readDoc(file);}else if(filePath.endsWith(".xls") || filePath.endsWith(".xlsx")){content = readExcel(file);}else if(filePath.endsWith(".pdf")){content = readPdf(file);}else{content = new String(Files.readAllBytes(Paths.get(folder)));}bean.setContent(content);fileBeans.add(bean);}return fileBeans;}/**讀取excel文件 * @param file * @return * @throws IOException * @throws InvalidFormatException * @throws EncryptedDocumentException */private static String readExcel(File file) throws Exception {String filePath = file.getAbsolutePath();StringBuffer content = new StringBuffer("");if(filePath.endsWith(".xls")){InputStream inp = new FileInputStream(filePath);Workbook wb = WorkbookFactory.create(inp);Sheet sheet = wb.getSheetAt(0);for(int i = sheet.getFirstRowNum();i<= sheet.getPhysicalNumberOfRows();i++){HSSFRow row = (HSSFRow) sheet.getRow(i);if (row == null) {continue;}for (int j = row.getFirstCellNum(); j <= row.getLastCellNum(); j++) {HSSFCell cell = row.getCell(j);if (cell == null) {continue;}content.append(cell.getStringCellValue());}}wb.close();inp.close();}else{XSSFWorkbook xwb = new XSSFWorkbook(file.getAbsolutePath());XSSFSheet sheet = xwb.getSheetAt(0);// 定义 row、cellXSSFRow row;String cell;// 循环输出表格中的内容for (int i = sheet.getFirstRowNum(); i < sheet.getPhysicalNumberOfRows(); i++) {row = sheet.getRow(i);for (int j = row.getFirstCellNum(); j < row.getPhysicalNumberOfCells(); j++) {// 通过 row.getCell(j).toString() 获取单元格内容,cell = row.getCell(j).getRawValue();content.append(cell+" ");}}}return content.toString();}/**讀取word內容 * @param file * @return * @throws IOException * @throws OpenXML4JException * @throws XmlException */private static String readDoc(File file) throws IOException, XmlException, OpenXML4JException {String filePath = file.getAbsolutePath();if(filePath.endsWith(".doc")){InputStream is = new FileInputStream(file);WordExtractor ex = new WordExtractor(is);String text2003 = ex.getText();ex.close();is.close();return 
text2003;}else{OPCPackage opcPackage = POIXMLDocument.openPackage(filePath);POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);String text2007 = extractor.getText();extractor.close();return text2007;}}/**讀取pdf內容 * @param file * @return * @throws IOException */public static String readPdf(File file) throws IOException{PDDocument doc = PDDocument.load(file.getAbsolutePath());PDFTextStripper stripper = new PDFTextStripper();String content = stripper.getText(doc);doc.close();return content;}}正则查询query构建你的选择是做或不做,但不做就永远不会有机会

lucene搜索之正则表达式查询RegExQuery和手机邮箱查询示例

相关文章:

你感兴趣的文章:

标签云: