Integrating the ANSJ Chinese Word Segmentation Plugin into Solr


1. [Code] ANSJTokenizer, the class that performs the actual segmentation. The code targets the Lucene 4.x API, where a Tokenizer still takes a Reader in its constructor.

package org.ansj.solr;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeFactory;
import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.ToAnalysis;

public final class ANSJTokenizer extends Tokenizer {

    // ANSJ analysis pipeline, created lazily from the current reader.
    private Analysis udf = null;

    // End offset of the last token emitted; reported by end().
    private int finalOffset = 0;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    public ANSJTokenizer(Reader in) {
        super(in);
    }

    public ANSJTokenizer(AttributeFactory factory, Reader in) {
        super(factory, in);
    }

    public Analysis getAnalysis() {
        udf = new ToAnalysis(input);
        return udf;
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        if (udf == null) {
            udf = getAnalysis();
        }
        Term term = udf.next();
        if (term == null) {
            end();
            return false;
        }
        termAtt.copyBuffer(term.getName().toCharArray(), 0, term.getName().length());
        int start = term.getOffe();
        int end = term.getTo().getOffe();
        // If the text contains a newline, the first word of the new line can report
        // an end offset smaller than its start offset (or a negative one); fall back
        // to start + length in that case to avoid emitting an invalid offset range.
        if (end < start || end < 0) {
            end = start + term.getName().length();
        }
        offsetAtt.setOffset(start, end);
        typeAtt.setType("word");
        finalOffset = end;
        return true;
    }

    @Override
    public final void end() throws IOException {
        super.end();
        // Report the end of the last emitted token as the final offset.
        int offset = correctOffset(finalOffset);
        offsetAtt.setOffset(offset, offset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        finalOffset = 0;
        udf = new ToAnalysis(input);
    }

    @Override
    public void close() throws IOException {
        super.close();
        finalOffset = 0;
    }
}
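For a quick sanity check outside Solr, the tokenizer can be driven directly through the standard Lucene TokenStream API. The following is a minimal sketch, assuming the ansj_seg library and Lucene 4.x are on the classpath; the sample sentence is arbitrary.

package org.ansj.solr;

import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ANSJTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Tokenize a short Chinese sentence and print each term with its offsets.
        ANSJTokenizer tokenizer = new ANSJTokenizer(new StringReader("我爱北京天安门"));
        CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokenizer.reset(); // must be called before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()
                    + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        tokenizer.end();
        tokenizer.close();
    }
}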

2. [Code] ANSJTokenizerFactory, the factory class that Solr instantiates from the schema.

package org.ansj.solr;

import java.io.IOException;
import java.io.Reader;
import java.util.Map;

import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

public class ANSJTokenizerFactory extends TokenizerFactory {

    // Reuse one tokenizer instance per thread instead of allocating one per request.
    private final ThreadLocal<ANSJTokenizer> tokenizerLocal = new ThreadLocal<ANSJTokenizer>();

    /** Creates a new ANSJTokenizerFactory. */
    public ANSJTokenizerFactory(Map<String, String> args) {
        super(args);
        assureMatchVersion();
        if (!args.isEmpty()) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
    }

    @Override
    public ANSJTokenizer create(AttributeFactory factory, Reader input) {
        ANSJTokenizer tokenizer = tokenizerLocal.get();
        if (tokenizer == null) {
            tokenizer = newTokenizer(factory, input);
        }
        try {
            tokenizer.setReader(input);
        } catch (IOException e) {
            // The cached instance could not be rebound to a new reader; start fresh.
            tokenizer = newTokenizer(factory, input);
        }
        return tokenizer;
    }

    private ANSJTokenizer newTokenizer(AttributeFactory factory, Reader input) {
        ANSJTokenizer tokenizer = new ANSJTokenizer(factory, input);
        tokenizerLocal.set(tokenizer);
        return tokenizer;
    }
}
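A design note: caching the tokenizer in a ThreadLocal and rebinding it with setReader() avoids allocating a new ANSJTokenizer for every analysis request, at the cost of assuming each analysis runs entirely on one thread. For Solr to find these classes, the compiled plugin jar and the ansj_seg jars must be on the core's classpath. A minimal sketch of the solrconfig.xml directives follows; the paths, jar names, and version numbers are illustrative, not from the original post.

<!-- In solrconfig.xml: load the plugin and its dependencies.
     Paths and jar names below are examples only. -->
<lib path="../../lib/ansj-solr-plugin.jar"/>
<lib path="../../lib/ansj_seg-2.0.8.jar"/>
<lib path="../../lib/nlp-lang-0.3.jar"/>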

3. [Code] fieldType definition in the Solr schema file. (The name and class attributes were stripped when the post was published; the tokenizer and stop-filter classes below are restored from context, "text_ansj" is an illustrative name, and the class of the trailing bare <filter/> is not recoverable from the original.)

<fieldType name="text_ansj" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="org.ansj.solr.ANSJTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter/> <!-- class attribute lost in the original post -->
  </analyzer>
  <analyzer type="query">
    <tokenizer class="org.ansj.solr.ANSJTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
    <filter/> <!-- class attribute lost in the original post -->
  </analyzer>
</fieldType>
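With the field type in place, a field can reference it in the same schema file; the field name below is an example. The Analysis screen of the Solr admin UI is a convenient way to verify that Chinese text is being segmented as expected.

<!-- Illustrative field definition; "content" is an example name. -->
<field name="content" type="text_ansj" indexed="true" stored="true"/>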
