fix: lucene user dictionary file creation failure
- tidy up code comment formatting

commit b4e31775f8 (parent 59171fdb9b)
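Context for the fix: the user dictionary was previously referenced like the built-in dictionaries (classpath-style relative names such as lucene/main2012.dic), a location where a writable file generally cannot be created at runtime. The DefaultConfig hunk below switches it to an absolute path under System.getProperty("user.dir"). A minimal sketch of the file-creation logic such a path enables (hypothetical helper, not code from this commit):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

class UserDicFileSketch {
  // Resolve the dictionary under the working directory and make sure it exists.
  static Path ensureUserDic() throws IOException {
    Path dic = Paths.get(System.getProperty("user.dir"), "lucene", "userDic", "userDic.dic");
    Files.createDirectories(dic.getParent()); // create lucene/userDic/ if missing
    if (Files.notExists(dic)) {
      Files.createFile(dic); // an empty dictionary is valid; entries are appended later
    }
    return dic;
  }
}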
@@ -7,7 +7,6 @@ import com.rymcu.forest.core.result.GlobalResultGenerator;
 import com.rymcu.forest.lucene.model.UserDic;
 import com.rymcu.forest.lucene.service.UserDicService;
 import com.rymcu.forest.util.Utils;
-import lombok.extern.log4j.Log4j2;
 import org.springframework.web.bind.annotation.*;
 
 import javax.annotation.Resource;
@@ -30,24 +30,16 @@ public class DefaultConfig implements Configuration {
 
   /** 分词器默认字典路径 */
   private static final String PATH_DIC_MAIN = "lucene/main2012.dic";
+  /** 题词字典路径 */
   private static final String PATH_DIC_QUANTIFIER = "lucene/quantifier.dic";
+  /** 用户自定义字典路径 */
   private static final String PATH_USER_DIC =
       System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";
-  /** 分词器配置文件路径 */
-  private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
-  // 配置属性——扩展字典
-  private static final String EXT_DICT = "ext_dic";
-  // 配置属性——扩展停止词典
-  private static final String EXT_STOP = "ext_stopword";
-
+  /** 配置属性——扩展字典 */
   private String extDic = "lucene/ext.dic;" + PATH_USER_DIC;
+  /** 配置属性——扩展停止词典 */
   private String extStopword = "lucene/stopword.dic";
-  /*
-   * 是否使用smart方式分词
-   */
+  /** 是否使用smart方式分词 */
   private boolean useSmart;
 
   /**
@@ -138,5 +130,4 @@ public class DefaultConfig implements Configuration {
     }
     return extStopWordDictFiles;
   }
-
 }
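Note how the extension-dictionary setting above chains several sources into one value: extDic = "lucene/ext.dic;" + PATH_USER_DIC, i.e. entries separated by ';'. A sketch of how such a value is typically split into individual dictionary paths (class and method names assumed, not taken from this commit):

import java.util.ArrayList;
import java.util.List;

class ExtDicSketch {
  // "lucene/ext.dic;/app/lucene/userDic/userDic.dic" -> two dictionary paths
  static List<String> splitDictPaths(String extDic) {
    List<String> paths = new ArrayList<>();
    if (extDic == null) {
      return paths;
    }
    for (String segment : extDic.split(";")) {
      String p = segment.trim();
      if (!p.isEmpty()) {
        paths.add(p);
      }
    }
    return paths;
  }
}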
@@ -1,30 +1,24 @@
 /**
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * IK 中文分词 版本 5.0 IK Analyzer release 5.0
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 *
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ * <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
 */
 package com.rymcu.forest.lucene.core;
 
-
 import com.rymcu.forest.lucene.cfg.Configuration;
 import com.rymcu.forest.lucene.dic.Dictionary;
 
@@ -32,53 +26,42 @@ import java.io.IOException;
 import java.io.Reader;
 import java.util.*;
 
-/**
- *
- * 分词器上下文状态
- *
- */
+/** 分词器上下文状态 */
 class AnalyzeContext {
 
-  // 默认缓冲区大小
+  /** 默认缓冲区大小 */
   private static final int BUFF_SIZE = 4096;
-  // 缓冲区耗尽的临界值
+  /** 缓冲区耗尽的临界值 */
   private static final int BUFF_EXHAUST_CRITICAL = 100;
-  // 字符窜读取缓冲
+  /** 字符窜读取缓冲 */
   private char[] segmentBuff;
-  // 字符类型数组
+  /** 字符类型数组 */
   private int[] charTypes;
-  // 记录Reader内已分析的字串总长度
-  // 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
+  /** 记录Reader内已分析的字串总长度, 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 */
   private int buffOffset;
-  // 当前缓冲区位置指针
+  /** 当前缓冲区位置指针 */
   private int cursor;
-  // 最近一次读入的,可处理的字串长度
+  /** 最近一次读入的,可处理的字串长度 */
   private int available;
-  // 子分词器锁
-  // 该集合非空,说明有子分词器在占用segmentBuff
-  private Set<String> buffLocker;
+  /** 子分词器锁, 该集合非空,说明有子分词器在占用segmentBuff */
+  private final Set<String> buffLocker;
 
-  // 原始分词结果集合,未经歧义处理
+  /** 原始分词结果集合,未经歧义处理 */
   private QuickSortSet orgLexemes;
-  // LexemePath位置索引表
-  private Map<Integer, LexemePath> pathMap;
-  // 最终分词结果集
-  private LinkedList<Lexeme> results;
-  // 分词器配置项
-  private Configuration cfg;
+  /** LexemePath位置索引表 */
+  private final Map<Integer, LexemePath> pathMap;
+  /** 最终分词结果集 */
+  private final LinkedList<Lexeme> results;
+  /** 分词器配置项 */
+  private final Configuration cfg;
 
   public AnalyzeContext(Configuration cfg) {
     this.cfg = cfg;
     this.segmentBuff = new char[BUFF_SIZE];
     this.charTypes = new int[BUFF_SIZE];
-    this.buffLocker = new HashSet<String>();
+    this.buffLocker = new HashSet<>();
     this.orgLexemes = new QuickSortSet();
-    this.pathMap = new HashMap<Integer, LexemePath>();
-    this.results = new LinkedList<Lexeme>();
+    this.pathMap = new HashMap<>();
+    this.results = new LinkedList<>();
   }
 
   int getCursor() {
@@ -102,10 +85,11 @@ class AnalyzeContext {
   }
 
   /**
    * 根据context的上下文情况,填充segmentBuff
+   *
    * @param reader
    * @return 返回待分析的(有效的)字串长度
    * @throws IOException
    */
   int fillBuffer(Reader reader) throws IOException {
     int readCount = 0;
@@ -129,20 +113,14 @@ class AnalyzeContext {
     return readCount;
   }
 
-  /**
-   * 初始化buff指针,处理第一个字符
-   */
+  /** 初始化buff指针,处理第一个字符 */
   void initCursor() {
     this.cursor = 0;
     this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
     this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
   }
 
-  /**
-   * 指针+1
-   * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false
-   * 并处理当前字符
-   */
+  /** 指针+1 成功返回 true; 指针已经到了buff尾部,不能前进,返回false 并处理当前字符 */
   boolean moveCursor() {
     if (this.cursor < this.available - 1) {
       this.cursor++;
@@ -155,8 +133,8 @@ class AnalyzeContext {
   }
 
   /**
-   * 设置当前segmentBuff为锁定状态
-   * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff
+   * 设置当前segmentBuff为锁定状态 加入占用segmentBuff的子分词器名称,表示占用segmentBuff
+   *
    * @param segmenterName
    */
   void lockBuffer(String segmenterName) {
@@ -165,6 +143,7 @@ class AnalyzeContext {
 
   /**
    * 移除指定的子分词器名,释放对segmentBuff的占用
+   *
    * @param segmenterName
    */
   void unlockBuffer(String segmenterName) {
@@ -172,8 +151,8 @@ class AnalyzeContext {
   }
 
   /**
-   * 只要buffLocker中存在segmenterName
-   * 则buffer被锁定
+   * 只要buffLocker中存在segmenterName 则buffer被锁定
+   *
    * @return boolean 缓冲去是否被锁定
   */
   boolean isBufferLocked() {
@@ -181,8 +160,8 @@ class AnalyzeContext {
   }
 
   /**
-   * 判断当前segmentBuff是否已经用完
-   * 当前执针cursor移至segmentBuff末端this.available - 1
+   * 判断当前segmentBuff是否已经用完 当前执针cursor移至segmentBuff末端this.available - 1
+   *
    * @return
   */
   boolean isBufferConsumed() {
@@ -191,28 +170,28 @@ class AnalyzeContext {
 
   /**
    * 判断segmentBuff是否需要读取新数据
    *
-   * 满足一下条件时,
-   * 1.available == BUFF_SIZE 表示buffer满载
-   * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内
-   * 3.!context.isBufferLocked()表示没有segmenter在占用buffer
+   * <p>满足一下条件时, 1.available == BUFF_SIZE 表示buffer满载 2.buffIndex < available - 1 && buffIndex >
+   * available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 3.!context.isBufferLocked()表示没有segmenter在占用buffer
    * 要中断当前循环(buffer要进行移位,并再读取数据的操作)
+   *
   * @return
   */
   boolean needRefillBuffer() {
-    return this.available == BUFF_SIZE && this.cursor < this.available - 1
-        && this.cursor > this.available - BUFF_EXHAUST_CRITICAL && !this.isBufferLocked();
+    return this.available == BUFF_SIZE
+        && this.cursor < this.available - 1
+        && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
+        && !this.isBufferLocked();
   }
 
-  /**
-   * 累计当前的segmentBuff相对于reader起始位置的位移
-   */
+  /** 累计当前的segmentBuff相对于reader起始位置的位移 */
   void markBufferOffset() {
     this.buffOffset += this.cursor;
   }
 
   /**
    * 向分词结果集添加词元
+   *
   * @param lexeme
   */
   void addLexeme(Lexeme lexeme) {
@@ -220,8 +199,8 @@ class AnalyzeContext {
   }
 
   /**
-   * 添加分词结果路径
-   * 路径起始位置 ---> 路径 映射表
+   * 添加分词结果路径 路径起始位置 ---> 路径 映射表
+   *
   * @param path
   */
   void addLexemePath(LexemePath path) {
@@ -232,6 +211,7 @@ class AnalyzeContext {
 
   /**
    * 返回原始分词结果
+   *
   * @return
   */
   QuickSortSet getOrgLexemes() {
@@ -239,14 +219,12 @@ class AnalyzeContext {
   }
 
   /**
-   * 推送分词结果到结果集合
-   * 1.从buff头部遍历到this.cursor已处理位置
-   * 2.将map中存在的分词结果推入results
+   * 推送分词结果到结果集合 1.从buff头部遍历到this.cursor已处理位置 2.将map中存在的分词结果推入results
   * 3.将map中不存在的CJDK字符以单字方式推入results
   */
   void outputToResult() {
     int index = 0;
-    for (; index <= this.cursor;) {
+    while (index <= this.cursor) {
       // 跳过非CJK字符
       if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
         index++;
@@ -269,7 +247,7 @@ class AnalyzeContext {
             }
           }
         }
-      } else {// pathMap中找不到index对应的LexemePath
+      } else { // pathMap中找不到index对应的LexemePath
         // 单字输出
         this.outputSingleCJK(index);
         index++;
@@ -281,6 +259,7 @@ class AnalyzeContext {
 
   /**
    * 对CJK字符进行单字输出
+   *
   * @param index
   */
   private void outputSingleCJK(int index) {
@@ -294,9 +273,10 @@ class AnalyzeContext {
   }
 
   /**
    * 返回lexeme
    *
-   * 同时处理合并
+   * <p>同时处理合并
+   *
   * @return
   */
   Lexeme getNextLexeme() {
@@ -305,8 +285,8 @@ class AnalyzeContext {
     while (result != null) {
       // 数量词合并
       this.compound(result);
-      if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(),
-          result.getLength())) {
+      if (Dictionary.getSingleton()
+          .isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
         // 是停止词继续取列表的下一个
         result = this.results.pollFirst();
       } else {
@@ -318,9 +298,7 @@ class AnalyzeContext {
     return result;
   }
 
-  /**
-   * 重置分词上下文状态
-   */
+  /** 重置分词上下文状态 */
   void reset() {
     this.buffLocker.clear();
     this.orgLexemes = new QuickSortSet();
@@ -333,9 +311,7 @@ class AnalyzeContext {
     this.pathMap.clear();
   }
 
-  /**
-   * 组合词元
-   */
+  /** 组合词元 */
   private void compound(Lexeme result) {
     if (!this.cfg.useSmart()) {
       return;
@@ -372,8 +348,6 @@ class AnalyzeContext {
           this.results.pollFirst();
         }
       }
-
     }
   }
-
 }
@@ -1,55 +1,42 @@
 /**
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * IK 中文分词 版本 5.0 IK Analyzer release 5.0
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 *
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ * <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
 */
 package com.rymcu.forest.lucene.core;
 
-
-
 import com.rymcu.forest.lucene.dic.Dictionary;
 import com.rymcu.forest.lucene.dic.Hit;
 
 import java.util.LinkedList;
 import java.util.List;
 
-/**
- * 中文-日韩文子分词器
- */
+/** 中文-日韩文子分词器 */
 class CJKSegmenter implements ISegmenter {
 
-  // 子分词器标签
+  /** 子分词器标签 */
   static final String SEGMENTER_NAME = "CJK_SEGMENTER";
-  // 待处理的分词hit队列
+  /** 待处理的分词hit队列 */
   private List<Hit> tmpHits;
 
   CJKSegmenter() {
     this.tmpHits = new LinkedList<Hit>();
   }
 
-  /*
-   * (non-Javadoc)
-   * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
-   */
   @Override
   public void analyze(AnalyzeContext context) {
     if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) {
@@ -59,15 +46,20 @@ class CJKSegmenter implements ISegmenter {
       // 处理词段队列
       Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
       for (Hit hit : tmpArray) {
-        hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(),
-            context.getCursor(), hit);
+        hit =
+            Dictionary.getSingleton()
+                .matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
         if (hit.isMatch()) {
           // 输出当前的词
-          Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(),
-              context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_CNWORD);
+          Lexeme newLexeme =
+              new Lexeme(
+                  context.getBufferOffset(),
+                  hit.getBegin(),
+                  context.getCursor() - hit.getBegin() + 1,
+                  Lexeme.TYPE_CNWORD);
           context.addLexeme(newLexeme);
 
-          if (!hit.isPrefix()) {// 不是词前缀,hit不需要继续匹配,移除
+          if (!hit.isPrefix()) { // 不是词前缀,hit不需要继续匹配,移除
             this.tmpHits.remove(hit);
           }
 
@@ -78,14 +70,14 @@ class CJKSegmenter implements ISegmenter {
         }
       }
 
-      // *********************************
       // 再对当前指针位置的字符进行单字匹配
-      Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(),
-          context.getCursor(), 1);
-      if (singleCharHit.isMatch()) {// 首字成词
+      Hit singleCharHit =
+          Dictionary.getSingleton()
+              .matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
+      if (singleCharHit.isMatch()) { // 首字成词
         // 输出当前的词
-        Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1,
-            Lexeme.TYPE_CNWORD);
+        Lexeme newLexeme =
+            new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD);
         context.addLexeme(newLexeme);
 
         // 同时也是词前缀
@@ -93,7 +85,7 @@ class CJKSegmenter implements ISegmenter {
           // 前缀匹配则放入hit列表
           this.tmpHits.add(singleCharHit);
         }
-      } else if (singleCharHit.isPrefix()) {// 首字为词前缀
+      } else if (singleCharHit.isPrefix()) { // 首字为词前缀
         // 前缀匹配则放入hit列表
         this.tmpHits.add(singleCharHit);
       }
@@ -119,14 +111,9 @@ class CJKSegmenter implements ISegmenter {
     }
   }
 
-  /*
-   * (non-Javadoc)
-   * @see org.wltea.analyzer.core.ISegmenter#reset()
-   */
   @Override
   public void reset() {
     // 清空队列
     this.tmpHits.clear();
   }
-
 }
@@ -1,30 +1,24 @@
 /**
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * IK 中文分词 版本 5.0 IK Analyzer release 5.0
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 *
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ * <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
 */
 package com.rymcu.forest.lucene.core;
 
-
 import com.rymcu.forest.lucene.dic.Dictionary;
 import com.rymcu.forest.lucene.dic.Hit;
 
@@ -33,19 +27,17 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
 
-/**
- *
- * 中文数量词子分词器
- */
+/** 中文数量词子分词器 */
 class CN_QuantifierSegmenter implements ISegmenter {
 
-  // 子分词器标签
+  /** 子分词器标签 */
   static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
 
-  // 中文数词
-  private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";// Cnum
-  private static Set<Character> ChnNumberChars = new HashSet<Character>();
+  /** 中文数词 */
+  private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
+
+  private static Set<Character> ChnNumberChars = new HashSet<>();
 
   static {
     char[] ca = Chn_Num.toCharArray();
     for (char nChar : ca) {
@@ -53,17 +45,13 @@ class CN_QuantifierSegmenter implements ISegmenter {
     }
   }
 
-  /*
-   * 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符
-   */
+  /** 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 */
   private int nStart;
-  /*
-   * 记录词元结束位置 end记录的是在词元中最后一个出现的合理的数词结束
-   */
+  /** 记录词元结束位置 end记录的是在词元中最后一个出现的合理的数词结束 */
   private int nEnd;
 
-  // 待处理的量词hit队列
-  private List<Hit> countHits;
+  /** 待处理的量词hit队列 */
+  private final List<Hit> countHits;
 
   CN_QuantifierSegmenter() {
     nStart = -1;
@@ -71,16 +59,13 @@ class CN_QuantifierSegmenter implements ISegmenter {
     this.countHits = new LinkedList<Hit>();
   }
 
-  /**
-   * 分词
-   */
+  /** 分词 */
   @Override
   public void analyze(AnalyzeContext context) {
     // 处理中文数词
     this.processCNumber(context);
     // 处理中文量词
     this.processCount(context);
 
     // 判断是否锁定缓冲区
     if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) {
       // 对缓冲区解锁
@@ -90,9 +75,7 @@ class CN_QuantifierSegmenter implements ISegmenter {
     }
   }
 
-  /**
-   * 重置子分词器状态
-   */
+  /** 重置子分词器状态 */
   @Override
   public void reset() {
     nStart = -1;
@@ -100,18 +83,16 @@ class CN_QuantifierSegmenter implements ISegmenter {
     countHits.clear();
   }
 
-  /**
-   * 处理数词
-   */
+  /** 处理数词 */
   private void processCNumber(AnalyzeContext context) {
-    if (nStart == -1 && nEnd == -1) {// 初始状态
+    if (nStart == -1 && nEnd == -1) { // 初始状态
       if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
           && ChnNumberChars.contains(context.getCurrentChar())) {
         // 记录数词的起始、结束位置
         nStart = context.getCursor();
         nEnd = context.getCursor();
       }
-    } else {// 正在处理状态
+    } else { // 正在处理状态
       if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
           && ChnNumberChars.contains(context.getCurrentChar())) {
         // 记录数词的结束位置
@@ -139,6 +120,7 @@ class CN_QuantifierSegmenter implements ISegmenter {
 
   /**
    * 处理中文量词
+   *
   * @param context
   */
   private void processCount(AnalyzeContext context) {
@@ -146,23 +128,26 @@ class CN_QuantifierSegmenter implements ISegmenter {
     if (!this.needCountScan(context)) {
       return;
     }
 
     if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {
 
       // 优先处理countHits中的hit
       if (!this.countHits.isEmpty()) {
         // 处理词段队列
         Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
         for (Hit hit : tmpArray) {
-          hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(),
-              context.getCursor(), hit);
+          hit =
+              Dictionary.getSingleton()
+                  .matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
           if (hit.isMatch()) {
             // 输出当前的词
-            Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(),
-                context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT);
+            Lexeme newLexeme =
+                new Lexeme(
+                    context.getBufferOffset(),
+                    hit.getBegin(),
+                    context.getCursor() - hit.getBegin() + 1,
+                    Lexeme.TYPE_COUNT);
             context.addLexeme(newLexeme);
 
-            if (!hit.isPrefix()) {// 不是词前缀,hit不需要继续匹配,移除
+            if (!hit.isPrefix()) { // 不是词前缀,hit不需要继续匹配,移除
               this.countHits.remove(hit);
             }
 
@@ -172,33 +157,29 @@ class CN_QuantifierSegmenter implements ISegmenter {
           }
         }
       }
 
-      // *********************************
       // 对当前指针位置的字符进行单字匹配
-      Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(),
-          context.getCursor(), 1);
-      if (singleCharHit.isMatch()) {// 首字成量词词
+      Hit singleCharHit =
+          Dictionary.getSingleton()
+              .matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
+      if (singleCharHit.isMatch()) { // 首字成量词词
        // 输出当前的词
-        Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1,
-            Lexeme.TYPE_COUNT);
+        Lexeme newLexeme =
+            new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT);
         context.addLexeme(newLexeme);
 
         // 同时也是词前缀
         if (singleCharHit.isPrefix()) {
           // 前缀匹配则放入hit列表
           this.countHits.add(singleCharHit);
         }
-      } else if (singleCharHit.isPrefix()) {// 首字为量词前缀
+      } else if (singleCharHit.isPrefix()) { // 首字为量词前缀
        // 前缀匹配则放入hit列表
        this.countHits.add(singleCharHit);
      }
 
     } else {
       // 输入的不是中文字符
       // 清空未成形的量词
       this.countHits.clear();
     }
 
     // 缓冲区数据已经读完,还有尚未输出的量词
     if (context.isBufferConsumed()) {
       // 清空未成形的量词
@@ -208,6 +189,7 @@ class CN_QuantifierSegmenter implements ISegmenter {
 
   /**
    * 判断是否需要扫描量词
+   *
   * @return
   */
   private boolean needCountScan(AnalyzeContext context) {
@@ -230,16 +212,15 @@ class CN_QuantifierSegmenter implements ISegmenter {
 
   /**
    * 添加数词词元到结果集
+   *
   * @param context
   */
   private void outputNumLexeme(AnalyzeContext context) {
     if (nStart > -1 && nEnd > -1) {
       // 输出数词
-      Lexeme newLexeme = new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1,
-          Lexeme.TYPE_CNUM);
+      Lexeme newLexeme =
+          new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM);
       context.addLexeme(newLexeme);
-
     }
   }
-
 }
@@ -1,34 +1,27 @@
 /**
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * IK 中文分词 版本 5.0 IK Analyzer release 5.0
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 *
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- * 字符集识别工具类
+ * <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
+ *
+ * <p>字符集识别工具类
 */
 package com.rymcu.forest.lucene.core;
 
-/**
- *
- * 字符集识别工具类
- */
+/** 字符集识别工具类 */
 class CharacterUtil {
 
   public static final int CHAR_USELESS = 0;
@@ -43,6 +36,7 @@ class CharacterUtil {
 
   /**
    * 识别字符类型
+   *
   * @param input
   * @return int CharacterUtil定义的字符类型常量
   */
@@ -72,7 +66,6 @@ class CharacterUtil {
         || ub == Character.UnicodeBlock.KATAKANA // 片假名
         || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
       return CHAR_OTHER_CJK;
-
     }
   }
   // 其他的不做处理的字符
@@ -81,6 +74,7 @@ class CharacterUtil {
 
   /**
    * 进行字符规格化(全角转半角,大写转小写处理)
+   *
   * @param input
   * @return char
   */
@@ -1,44 +1,36 @@
 /**
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * IK 中文分词 版本 5.0 IK Analyzer release 5.0
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 *
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ * <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
 */
 package com.rymcu.forest.lucene.core;
 
 import java.util.Stack;
 import java.util.TreeSet;
 
-/**
- * IK分词歧义裁决器
- */
+/** IK分词歧义裁决器 */
 class IKArbitrator {
 
-  IKArbitrator() {
-
-  }
+  IKArbitrator() {}
 
   /**
    * 分词歧义处理
-   * @param orgLexemes
+   *
+   * @param context
   * @param useSmart
   */
   void process(AnalyzeContext context, boolean useSmart) {
@@ -84,9 +76,10 @@ class IKArbitrator {
 
   /**
    * 歧义识别
+   *
   * @param lexemeCell 歧义路径链表头
   * @param fullTextLength 歧义路径文本长度
-   * @param option 候选结果路径
+   * @param fullTextLength 候选结果路径
   * @return
   */
   private LexemePath judge(QuickSortSet.Cell lexemeCell, int fullTextLength) {
@@ -114,12 +107,12 @@ class IKArbitrator {
 
     // 返回集合中的最优方案
     return pathOptions.first();
 
   }
 
   /**
    * 向前遍历,添加词元,构造一个无歧义词元组合
-   * @param LexemePath path
+   *
+   * @param option path
   * @return
   */
   private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
@@ -139,14 +132,13 @@ class IKArbitrator {
 
   /**
    * 回滚词元链,直到它能够接受指定的词元
-   * @param lexeme
+   *
+   * @param option
   * @param l
   */
   private void backPath(Lexeme l, LexemePath option) {
     while (option.checkCross(l)) {
       option.removeTail();
     }
-
   }
-
 }
@@ -1,25 +1,21 @@
 /**
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * IK 中文分词 版本 5.0 IK Analyzer release 5.0
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 *
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
+ * <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
 */
 package com.rymcu.forest.lucene.core;
 
@@ -32,30 +28,26 @@ import java.io.Reader;
 import java.util.ArrayList;
 import java.util.List;
 
-/**
- * IK分词器主类
- *
- */
+/** IK分词器主类 */
 public final class IKSegmenter {
 
-  // 字符窜reader
+  /** 字符窜reader */
   private Reader input;
-  // 分词器配置项
+  /** 分词器配置项 */
   private Configuration cfg;
-  // 分词器上下文
+  /** 分词器上下文 */
   private AnalyzeContext context;
-  // 分词处理器列表
+  /** 分词处理器列表 */
   private List<ISegmenter> segmenters;
-  // 分词歧义裁决器
+  /** 分词歧义裁决器 */
   private IKArbitrator arbitrator;
 
   /**
    * IK分词器构造函数
-   * @param input
+   *
+   * @param input
   * @param useSmart 为true,使用智能分词策略
-   *
-   * 非智能分词:细粒度输出所有可能的切分结果
-   * 智能分词: 合并数词和量词,对分词结果进行歧义判断
+   *     <p>非智能分词:细粒度输出所有可能的切分结果 智能分词: 合并数词和量词,对分词结果进行歧义判断
   */
   public IKSegmenter(Reader input, boolean useSmart) {
     this.input = input;
@@ -66,9 +58,9 @@ public final class IKSegmenter {
 
   /**
    * IK分词器构造函数
+   *
   * @param input
   * @param cfg 使用自定义的Configuration构造分词器
-   *
   */
   public IKSegmenter(Reader input, Configuration cfg) {
     this.input = input;
@@ -76,9 +68,7 @@ public final class IKSegmenter {
     this.init();
   }
 
-  /**
-   * 初始化
-   */
+  /** 初始化 */
   private void init() {
     // 初始化词典单例
     Dictionary.initial(this.cfg);
@@ -92,6 +82,7 @@ public final class IKSegmenter {
 
   /**
    * 初始化词典,加载子分词器实现
+   *
   * @return List<ISegmenter>
   */
   private List<ISegmenter> loadSegmenters() {
@@ -107,6 +98,7 @@ public final class IKSegmenter {
 
   /**
    * 分词,获取下一个词元
+   *
   * @return Lexeme 词元对象
   * @throws IOException
   */
@@ -152,9 +144,10 @@ public final class IKSegmenter {
   }
 
   /**
    * 重置分词器到初始状态
-   * @param input
-   */
+   *
+   * @param input
+   */
   public synchronized void reset(Reader input) {
     this.input = input;
     context.reset();
@@ -1,72 +1,46 @@
 /**
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * IK 中文分词 版本 5.0 IK Analyzer release 5.0
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
 * limitations under the License.
 *
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ * <p>源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
 */
 package com.rymcu.forest.lucene.core;
 
 import java.util.Arrays;
 
-/**
- *
- * 英文字符及阿拉伯数字子分词器
- */
+/** 英文字符及阿拉伯数字子分词器 */
 class LetterSegmenter implements ISegmenter {
 
-  // 子分词器标签
+  /** 子分词器标签 */
   static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
-  // 链接符号
-  private static final char[] Letter_Connector = new char[] { '#', '&', '+', '-', '.', '@', '_' };
-  // 数字符号
-  private static final char[] Num_Connector = new char[] { ',', '.' };
+  /** 链接符号 */
+  private static final char[] Letter_Connector = new char[] {'#', '&', '+', '-', '.', '@', '_'};
+  /** 数字符号 */
+  private static final char[] Num_Connector = new char[] {',', '.'};
 
-  /*
-   * 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符
-   */
+  /** 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 */
   private int start;
-  /*
-   * 记录词元结束位置 end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
-   */
+  /** 记录词元结束位置 end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 */
   private int end;
-  /*
-   * 字母起始位置
-   */
+  /** 字母起始位置 */
   private int englishStart;
-  /*
-   * 字母结束位置
-   */
+  /** 字母结束位置 */
   private int englishEnd;
-  /*
-   * 阿拉伯数字起始位置
-   */
+  /** 阿拉伯数字起始位置 */
   private int arabicStart;
-  /*
-   * 阿拉伯数字结束位置
-   */
+  /** 阿拉伯数字结束位置 */
   private int arabicEnd;
 
   LetterSegmenter() {
@@ -80,10 +54,6 @@ class LetterSegmenter implements ISegmenter {
     this.arabicEnd = -1;
   }
 
-  /*
-   * (non-Javadoc)
-   * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
-   */
   @Override
   public void analyze(AnalyzeContext context) {
     boolean bufferLockFlag = false;
@@ -103,10 +73,6 @@ class LetterSegmenter implements ISegmenter {
     }
   }
 
-  /*
-   * (non-Javadoc)
-   * @see org.wltea.analyzer.core.ISegmenter#reset()
-   */
   @Override
   public void reset() {
     this.start = -1;
@@ -118,16 +84,15 @@ class LetterSegmenter implements ISegmenter {
   }
 
   /**
-   * 处理数字字母混合输出
-   * 如:windos2000 | linliangyi2005@gmail.com
-   * @param input
+   * 处理数字字母混合输出 如:windos2000 | linliangyi2005@gmail.com
+   *
   * @param context
   * @return
   */
   private boolean processMixLetter(AnalyzeContext context) {
     boolean needLock = false;
 
-    if (this.start == -1) {// 当前的分词器尚未开始处理字符
+    if (this.start == -1) { // 当前的分词器尚未开始处理字符
       if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
           || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
         // 记录起始指针的位置,标明分词器进入处理状态
@@ -135,7 +100,7 @@ class LetterSegmenter implements ISegmenter {
         this.end = start;
       }
 
-    } else {// 当前的分词器正在处理字符
+    } else { // 当前的分词器正在处理字符
       if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
           || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
         // 记录下可能的结束位置
@@ -147,8 +112,12 @@ class LetterSegmenter implements ISegmenter {
         this.end = context.getCursor();
       } else {
         // 遇到非Letter字符,输出词元
-        Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start
-            + 1, Lexeme.TYPE_LETTER);
+        Lexeme newLexeme =
+            new Lexeme(
+                context.getBufferOffset(),
+                this.start,
+                this.end - this.start + 1,
+                Lexeme.TYPE_LETTER);
         context.addLexeme(newLexeme);
         this.start = -1;
         this.end = -1;
@@ -159,8 +128,12 @@ class LetterSegmenter implements ISegmenter {
     if (context.isBufferConsumed()) {
       if (this.start != -1 && this.end != -1) {
         // 缓冲以读完,输出词元
-        Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start
-            + 1, Lexeme.TYPE_LETTER);
+        Lexeme newLexeme =
+            new Lexeme(
+                context.getBufferOffset(),
+                this.start,
+                this.end - this.start + 1,
+                Lexeme.TYPE_LETTER);
         context.addLexeme(newLexeme);
         this.start = -1;
         this.end = -1;
@@ -168,37 +141,38 @@ class LetterSegmenter implements ISegmenter {
     }
 
     // 判断是否锁定缓冲区
-    if (this.start == -1 && this.end == -1) {
-      // 对缓冲区解锁
-      needLock = false;
-    } else {
-      needLock = true;
-    }
+    // 对缓冲区解锁
+    needLock = this.start != -1 || this.end != -1;
     return needLock;
   }
 
   /**
    * 处理纯英文字母输出
+   *
   * @param context
   * @return
   */
   private boolean processEnglishLetter(AnalyzeContext context) {
     boolean needLock = false;
 
-    if (this.englishStart == -1) {// 当前的分词器尚未开始处理英文字符
+    if (this.englishStart == -1) { // 当前的分词器尚未开始处理英文字符
       if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
         // 记录起始指针的位置,标明分词器进入处理状态
         this.englishStart = context.getCursor();
         this.englishEnd = this.englishStart;
       }
-    } else {// 当前的分词器正在处理英文字符
+    } else { // 当前的分词器正在处理英文字符
       if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
         // 记录当前指针位置为结束位置
         this.englishEnd = context.getCursor();
       } else {
         // 遇到非English字符,输出词元
-        Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd
-            - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
+        Lexeme newLexeme =
+            new Lexeme(
+                context.getBufferOffset(),
+                this.englishStart,
+                this.englishEnd - this.englishStart + 1,
+                Lexeme.TYPE_ENGLISH);
         context.addLexeme(newLexeme);
         this.englishStart = -1;
         this.englishEnd = -1;
@@ -209,8 +183,12 @@ class LetterSegmenter implements ISegmenter {
     if (context.isBufferConsumed()) {
       if (this.englishStart != -1 && this.englishEnd != -1) {
         // 缓冲以读完,输出词元
-        Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd
-            - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
+        Lexeme newLexeme =
+            new Lexeme(
+                context.getBufferOffset(),
+                this.englishStart,
+                this.englishEnd - this.englishStart + 1,
+                Lexeme.TYPE_ENGLISH);
         context.addLexeme(newLexeme);
         this.englishStart = -1;
         this.englishEnd = -1;
@@ -218,30 +196,27 @@ class LetterSegmenter implements ISegmenter {
     }
 
     // 判断是否锁定缓冲区
-    if (this.englishStart == -1 && this.englishEnd == -1) {
-      // 对缓冲区解锁
-      needLock = false;
-    } else {
-      needLock = true;
-    }
+    // 对缓冲区解锁
+    needLock = this.englishStart != -1 || this.englishEnd != -1;
    return needLock;
   }
 
   /**
    * 处理阿拉伯数字输出
+   *
   * @param context
   * @return
   */
   private boolean processArabicLetter(AnalyzeContext context) {
|
private boolean processArabicLetter(AnalyzeContext context) {
|
||||||
boolean needLock = false;
|
boolean needLock = false;
|
||||||
|
|
||||||
if (this.arabicStart == -1) {// 当前的分词器尚未开始处理数字字符
|
if (this.arabicStart == -1) { // 当前的分词器尚未开始处理数字字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
||||||
// 记录起始指针的位置,标明分词器进入处理状态
|
// 记录起始指针的位置,标明分词器进入处理状态
|
||||||
this.arabicStart = context.getCursor();
|
this.arabicStart = context.getCursor();
|
||||||
this.arabicEnd = this.arabicStart;
|
this.arabicEnd = this.arabicStart;
|
||||||
}
|
}
|
||||||
} else {// 当前的分词器正在处理数字字符
|
} else { // 当前的分词器正在处理数字字符
|
||||||
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
|
||||||
// 记录当前指针位置为结束位置
|
// 记录当前指针位置为结束位置
|
||||||
this.arabicEnd = context.getCursor();
|
this.arabicEnd = context.getCursor();
|
||||||
@ -250,8 +225,12 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
// 不输出数字,但不标记结束
|
// 不输出数字,但不标记结束
|
||||||
} else {
|
} else {
|
||||||
// //遇到非Arabic字符,输出词元
|
// //遇到非Arabic字符,输出词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd
|
Lexeme newLexeme =
|
||||||
- this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
new Lexeme(
|
||||||
|
context.getBufferOffset(),
|
||||||
|
this.arabicStart,
|
||||||
|
this.arabicEnd - this.arabicStart + 1,
|
||||||
|
Lexeme.TYPE_ARABIC);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.arabicStart = -1;
|
this.arabicStart = -1;
|
||||||
this.arabicEnd = -1;
|
this.arabicEnd = -1;
|
||||||
@ -262,8 +241,12 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
if (context.isBufferConsumed()) {
|
if (context.isBufferConsumed()) {
|
||||||
if (this.arabicStart != -1 && this.arabicEnd != -1) {
|
if (this.arabicStart != -1 && this.arabicEnd != -1) {
|
||||||
// 生成已切分的词元
|
// 生成已切分的词元
|
||||||
Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd
|
Lexeme newLexeme =
|
||||||
- this.arabicStart + 1, Lexeme.TYPE_ARABIC);
|
new Lexeme(
|
||||||
|
context.getBufferOffset(),
|
||||||
|
this.arabicStart,
|
||||||
|
this.arabicEnd - this.arabicStart + 1,
|
||||||
|
Lexeme.TYPE_ARABIC);
|
||||||
context.addLexeme(newLexeme);
|
context.addLexeme(newLexeme);
|
||||||
this.arabicStart = -1;
|
this.arabicStart = -1;
|
||||||
this.arabicEnd = -1;
|
this.arabicEnd = -1;
|
||||||
@ -271,17 +254,14 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 判断是否锁定缓冲区
|
// 判断是否锁定缓冲区
|
||||||
if (this.arabicStart == -1 && this.arabicEnd == -1) {
|
// 对缓冲区解锁
|
||||||
// 对缓冲区解锁
|
needLock = this.arabicStart != -1 || this.arabicEnd != -1;
|
||||||
needLock = false;
|
|
||||||
} else {
|
|
||||||
needLock = true;
|
|
||||||
}
|
|
||||||
return needLock;
|
return needLock;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 判断是否是字母连接符号
|
* 判断是否是字母连接符号
|
||||||
|
*
|
||||||
* @param input
|
* @param input
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
@ -292,6 +272,7 @@ class LetterSegmenter implements ISegmenter {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 判断是否是数字连接符号
|
* 判断是否是数字连接符号
|
||||||
|
*
|
||||||
* @param input
|
* @param input
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
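Review note on the three `needLock` blocks rewritten above: the old five-line `if (start == -1 && end == -1) { needLock = false; } else { needLock = true; }` form and the new single expression are equivalent by De Morgan's law, `!(a && b) == (!a || !b)`. A standalone sketch (hypothetical values, not project test code) that checks the equivalence:

```java
public class NeedLockEquivalenceCheck {
  // Original branching form from the old code.
  static boolean needLockBranching(int start, int end) {
    boolean needLock;
    if (start == -1 && end == -1) {
      needLock = false; // buffer can be unlocked
    } else {
      needLock = true;
    }
    return needLock;
  }

  // Simplified form introduced by this commit.
  static boolean needLockExpression(int start, int end) {
    return start != -1 || end != -1;
  }

  public static void main(String[] args) {
    int[] values = {-1, 0, 5}; // representative cursor states
    for (int start : values) {
      for (int end : values) {
        if (needLockBranching(start, end) != needLockExpression(start, end)) {
          throw new AssertionError("forms disagree at start=" + start + ", end=" + end);
        }
      }
    }
    System.out.println("branching and expression forms agree");
  }
}
```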
@@ -28,33 +28,22 @@ import java.nio.charset.StandardCharsets;
 import java.util.Collection;
 import java.util.List;

-/** 词典管理类,单子模式 */
+/** 词典管理类,单例模式 */
 public class Dictionary {

-  /*
-   * 词典单子实例
-   */
+  /** 词典单例 */
   private static Dictionary singleton;
-  /*
-   * 主词典对象
-   */
+  /** 主词典对象 */
   private DictSegment _MainDict;
-  /*
-   * 停止词词典
-   */
+  /** 停止词词典 */
   private DictSegment _StopWordDict;
-  /*
-   * 量词词典
-   */
+  /** 量词词典 */
   private DictSegment _QuantifierDict;
+  /** 用户自定义词典路径 */
   private static final String PATH_USER_DIC =
       System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";

   /** 配置对象 */
-  private Configuration cfg;
+  private final Configuration cfg;

   private Dictionary(Configuration cfg) {
     this.cfg = cfg;
@@ -190,7 +179,7 @@ public class Dictionary {
       InputStream is = resource.getInputStream();
       BufferedReader br =
           new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
-      String theWord = null;
+      String theWord;
       do {
         theWord = br.readLine();
         if (theWord != null && !"".equals(theWord.trim())) {
@@ -210,7 +199,7 @@ public class Dictionary {
     // 加载扩展词典配置
     List<String> extDictFiles = cfg.getExtDictionary();
     if (extDictFiles != null) {
-      InputStream is = null;
+      InputStream is;
       for (String extDictName : extDictFiles) {
         // 读取扩展词典文件
         System.out.println("加载扩展词典:" + extDictName);
@@ -224,8 +213,9 @@ public class Dictionary {
           }
         }
         try {
-          BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-          String theWord = null;
+          BufferedReader br =
+              new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
+          String theWord;
           do {
             theWord = br.readLine();
             if (theWord != null && !"".equals(theWord.trim())) {
@@ -234,17 +224,12 @@ public class Dictionary {
               _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
             }
           } while (theWord != null);

         } catch (IOException ioe) {
           System.err.println("Extension Dictionary loading exception.");
           ioe.printStackTrace();

         } finally {
           try {
-            if (is != null) {
-              is.close();
-              is = null;
-            }
+            is.close();
           } catch (IOException e) {
             e.printStackTrace();
           }
@@ -271,26 +256,20 @@ public class Dictionary {
     }
     try {
       BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-      String theWord = null;
+      String theWord;
       do {
         theWord = br.readLine();
         if (theWord != null && !"".equals(theWord.trim())) {
-          // System.out.println(theWord);
           // 加载扩展停止词典数据到内存中
           _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
         }
       } while (theWord != null);

     } catch (IOException ioe) {
       System.err.println("Extension Stop word Dictionary loading exception.");
       ioe.printStackTrace();

     } finally {
       try {
-        if (is != null) {
-          is.close();
-          is = null;
-        }
+        is.close();
       } catch (IOException e) {
         e.printStackTrace();
       }
@@ -310,25 +289,21 @@ public class Dictionary {
       throw new RuntimeException("Quantifier Dictionary not found!!!");
     }
     try {
-      BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
-      String theWord = null;
+      BufferedReader br =
+          new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
+      String theWord;
       do {
         theWord = br.readLine();
         if (theWord != null && !"".equals(theWord.trim())) {
           _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
         }
       } while (theWord != null);

     } catch (IOException ioe) {
       System.err.println("Quantifier Dictionary loading exception.");
       ioe.printStackTrace();

     } finally {
       try {
-        if (is != null) {
-          is.close();
-          is = null;
-        }
+        is.close();
       } catch (IOException e) {
         e.printStackTrace();
       }
@@ -349,7 +324,7 @@ public class Dictionary {
     try {
       BufferedReader br =
           new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
-      String theWord = null;
+      String theWord;
       do {
         theWord = br.readLine();
         if (theWord != null && !"".equals(theWord.trim())) {
@@ -364,7 +339,6 @@ public class Dictionary {
     } finally {
       try {
         is.close();
-        is = null;
       } catch (IOException e) {
         e.printStackTrace();
       }
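Review note on the dictionary-loading hunks above: dropping the `if (is != null)` guard is safe only because each loader throws earlier when the stream is null, so `is` is guaranteed non-null by the `finally` block; the `is = null;` assignments were dead code. A try-with-resources version would remove the manual `finally` bookkeeping entirely. This is a sketch of an alternative, not what the commit does; `loadWords` is an illustrative stand-in, not a project method:

```java
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.function.Consumer;

public class DictLoadingSketch {
  // Reads one word per line and hands each non-blank word to the consumer.
  // Reader and underlying stream are closed automatically, even on exceptions.
  static void loadWords(InputStream is, Consumer<String> fill) throws IOException {
    try (BufferedReader br =
        new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512)) {
      String theWord;
      while ((theWord = br.readLine()) != null) {
        if (!theWord.trim().isEmpty()) {
          fill.accept(theWord.trim().toLowerCase()); // e.g. _MainDict::fillSegment after toCharArray()
        }
      }
    }
  }
}
```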
@@ -12,7 +12,7 @@ import java.util.List;
 import java.util.concurrent.CountDownLatch;

 /**
- * BaiKeBeanIndex
+ * ArticleBeanIndex
  *
  * @author suwen
  * @date 2021/2/2 14:10
@@ -9,119 +9,120 @@ import java.text.ParseException;
 import java.util.List;
 import java.util.concurrent.CountDownLatch;


 /**
  * BaseIndex
  *
  * @author suwen
  * @date 2021/2/2 14:14
  */
-public abstract class BaseIndex<T> implements Runnable{
-  /**
-   * 父级索引路径
-   */
-  private String parentIndexPath;
-  /**
-   * 索引编写器
-   */
-  private IndexWriter writer;
-  private int subIndex;
-  /**
-   * 主线程
-   */
-  private final CountDownLatch countDownLatch1;
-  /**
-   *工作线程
-   */
-  private final CountDownLatch countDownLatch2;
-  /**
-   * 对象列表
-   */
-  private List<T> list;
-  public BaseIndex(IndexWriter writer,CountDownLatch countDownLatch1, CountDownLatch countDownLatch2,
-      List<T> list){
-    super();
-    this.writer = writer;
-    this.countDownLatch1 = countDownLatch1;
-    this.countDownLatch2 = countDownLatch2;
-    this.list = list;
-  }
-  public BaseIndex(String parentIndexPath, int subIndex,
-      CountDownLatch countDownLatch1, CountDownLatch countDownLatch2,
-      List<T> list) {
-    super();
-    this.parentIndexPath = parentIndexPath;
-    this.subIndex = subIndex;
-    try {
-      //多目录索引创建
-      File file = new File(parentIndexPath+"/index"+subIndex);
-      if(!file.exists()){
-        file.mkdir();
-      }
-      this.writer = IndexUtil.getIndexWriter(parentIndexPath+"/index"+subIndex, true);
-    } catch (IOException e) {
-      e.printStackTrace();
-    };
-    this.subIndex = subIndex;
-    this.countDownLatch1 = countDownLatch1;
-    this.countDownLatch2 = countDownLatch2;
-    this.list = list;
-  }
-  public BaseIndex(String path,CountDownLatch countDownLatch1, CountDownLatch countDownLatch2,
-      List<T> list) {
-    super();
-    try {
-      //单目录索引创建
-      File file = new File(path);
-      if(!file.exists()){
-        file.mkdir();
-      }
-      this.writer = IndexUtil.getIndexWriter(path,true);
-    } catch (IOException e) {
-      e.printStackTrace();
-    };
-    this.countDownLatch1 = countDownLatch1;
-    this.countDownLatch2 = countDownLatch2;
-    this.list = list;
-  }
+public abstract class BaseIndex<T> implements Runnable {
+  /** 父级索引路径 */
+  private String parentIndexPath;
+  /** 索引编写器 */
+  private IndexWriter writer;

-  /**创建索引
-   * @param writer
-   * @throws IOException
-   * @throws ParseException
-   */
-  public abstract void indexDoc(IndexWriter writer,T t) throws Exception;
-  /**批量索引创建
-   * @param writer
-   * @param t
-   * @throws Exception
-   */
-  public void indexDocs(IndexWriter writer,List<T> t) throws Exception{
-    for (T t2 : t) {
-      indexDoc(writer,t2);
-    }
+  private int subIndex;
+  /** 主线程 */
+  private final CountDownLatch countDownLatch1;
+  /** 工作线程 */
+  private final CountDownLatch countDownLatch2;
+  /** 对象列表 */
+  private List<T> list;
+
+  public BaseIndex(
+      IndexWriter writer,
+      CountDownLatch countDownLatch1,
+      CountDownLatch countDownLatch2,
+      List<T> list) {
+    super();
+    this.writer = writer;
+    this.countDownLatch1 = countDownLatch1;
+    this.countDownLatch2 = countDownLatch2;
+    this.list = list;
+  }
+
+  public BaseIndex(
+      String parentIndexPath,
+      int subIndex,
+      CountDownLatch countDownLatch1,
+      CountDownLatch countDownLatch2,
+      List<T> list) {
+    super();
+    this.parentIndexPath = parentIndexPath;
+    this.subIndex = subIndex;
+    try {
+      // 多目录索引创建
+      File file = new File(parentIndexPath + "/index" + subIndex);
+      if (!file.exists()) {
+        file.mkdir();
+      }
+      this.writer = IndexUtil.getIndexWriter(parentIndexPath + "/index" + subIndex, true);
+    } catch (IOException e) {
+      e.printStackTrace();
     }
+  ;
+    this.subIndex = subIndex;
+    this.countDownLatch1 = countDownLatch1;
+    this.countDownLatch2 = countDownLatch2;
+    this.list = list;
+  }
+
-  @Override
-  public void run() {
-    try {
-      countDownLatch1.await();
-      System.out.println(writer);
-      indexDocs(writer,list);
-    } catch (InterruptedException e) {
-      e.printStackTrace();
-    } catch (Exception e) {
-      e.printStackTrace();
-    }finally{
-      countDownLatch2.countDown();
-      try {
-        writer.commit();
-        writer.close();
-      } catch (IOException e) {
-        e.printStackTrace();
-      }
-
-  }
-
+  public BaseIndex(
+      String path, CountDownLatch countDownLatch1, CountDownLatch countDownLatch2, List<T> list) {
+    super();
+    try {
+      // 单目录索引创建
+      File file = new File(path);
+      if (!file.exists()) {
+        file.mkdir();
+      }
+      this.writer = IndexUtil.getIndexWriter(path, true);
+    } catch (IOException e) {
+      e.printStackTrace();
     }
+  ;
+    this.countDownLatch1 = countDownLatch1;
+    this.countDownLatch2 = countDownLatch2;
+    this.list = list;
+  }
+
+  /**
+   * 创建索引
+   *
+   * @param writer
+   * @throws IOException
+   * @throws ParseException
+   */
+  public abstract void indexDoc(IndexWriter writer, T t) throws Exception;
+  /**
+   * 批量索引创建
+   *
+   * @param writer
+   * @param t
+   * @throws Exception
+   */
+  public void indexDocs(IndexWriter writer, List<T> t) throws Exception {
+    for (T t2 : t) {
+      indexDoc(writer, t2);
+    }
+  }
+
+  @Override
+  public void run() {
+    try {
+      countDownLatch1.await();
+      System.out.println(writer);
+      indexDocs(writer, list);
+    } catch (Exception e) {
+      e.printStackTrace();
+    } finally {
+      countDownLatch2.countDown();
+      try {
+        writer.commit();
+        writer.close();
+      } catch (IOException e) {
+        e.printStackTrace();
+      }
+    }
+  }
 }
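Context note on the two `CountDownLatch` fields that survive this reformat: in the `run()` method, `countDownLatch1` acts as a start gate every indexing worker awaits before writing, and `countDownLatch2` is a completion latch counted down in `finally` so a coordinator can wait for all workers. A minimal standalone sketch of that pattern (hypothetical worker body, not the project's `BaseIndex`):

```java
import java.util.concurrent.CountDownLatch;

public class TwoLatchSketch {
  public static void main(String[] args) throws InterruptedException {
    int workers = 3;
    CountDownLatch startGate = new CountDownLatch(1);        // role of countDownLatch1
    CountDownLatch doneSignal = new CountDownLatch(workers); // role of countDownLatch2

    for (int i = 0; i < workers; i++) {
      int id = i;
      new Thread(
              () -> {
                try {
                  startGate.await(); // block until the coordinator releases all workers
                  System.out.println("worker " + id + " indexing...");
                } catch (InterruptedException e) {
                  Thread.currentThread().interrupt();
                } finally {
                  doneSignal.countDown(); // always report completion, as run() does in finally
                }
              })
          .start();
    }

    startGate.countDown(); // release every worker at once
    doneSignal.await();    // wait for all workers to finish
    System.out.println("all workers done");
  }
}
```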
@@ -22,9 +22,6 @@ public class UserDicServiceImpl implements UserDicService {

   @Resource private UserDicMapper userDicMapper;

-  /** Lucene索引文件路径 */
-  private final String dicPath = System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";
-
   @Override
   public List<String> getAllDic() {

@@ -56,8 +53,12 @@ public class UserDicServiceImpl implements UserDicService {

   private void writeUserDic() {
     try {
-      File file = new File(dicPath);
-      FileOutputStream stream = new FileOutputStream(file, false);
+      String filePath = "lucene/userDic/";
+      File file = new File(filePath);
+      if (!file.exists()) {
+        file.mkdirs();
+      }
+      FileOutputStream stream = new FileOutputStream(file + "/userDic.dic", false);
       OutputStreamWriter outfw = new OutputStreamWriter(stream, StandardCharsets.UTF_8);
       PrintWriter fw = new PrintWriter(new BufferedWriter(outfw));
       userDicMapper
@@ -70,7 +71,7 @@ public class UserDicServiceImpl implements UserDicService {
       fw.flush();
       fw.close();
       Dictionary.getSingleton().updateUserDict();
-    } catch (FileNotFoundException e) {
+    } catch (IOException e) {
       e.printStackTrace();
     }
   }
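This hunk is the fix the commit title refers to: the old code opened a `FileOutputStream` on a path whose parent directory (`lucene/userDic/`) might not exist yet, so the first write failed with `FileNotFoundException`. The new code creates the directory tree with `mkdirs()` before opening the stream (note that `file + "/userDic.dic"` relies on `File#toString`, which returns the same string as `file.getPath()`). A sketch of an equivalent `java.nio.file` approach, shown only as an alternative, with the same directory-then-write order; the path literals mirror the ones in the hunk:

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

public class UserDicWriteSketch {
  // Overwrites the user dictionary, one word per line, UTF-8 --
  // mirroring the PrintWriter loop in writeUserDic().
  static void writeUserDic(List<String> words) throws IOException {
    Path dir = Paths.get("lucene", "userDic");
    Files.createDirectories(dir); // no-op when the directory already exists
    Files.write(dir.resolve("userDic.dic"), words, StandardCharsets.UTF_8);
  }
}
```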
@@ -28,7 +28,7 @@ public class IndexUtil {
    * @throws IOException
    */
   public static IndexWriter getIndexWriter(String indexPath,boolean create) throws IOException{
-    Directory dir = FSDirectory.open(Paths.get(indexPath, new String[0]));
+    Directory dir = FSDirectory.open(Paths.get(indexPath));
     Analyzer analyzer = new IKAnalyzer();
     IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
     LogMergePolicy mergePolicy = new LogByteSizeMergePolicy();
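The `new String[0]` argument dropped here (and in the two `SearchUtil` hunks below) is likely leftover IDE or decompiler noise: `Paths.get(String first, String... more)` is a varargs method, so passing an explicit empty array is identical to passing nothing. A quick illustration:

```java
import java.nio.file.Path;
import java.nio.file.Paths;

public class PathsVarargsSketch {
  public static void main(String[] args) {
    Path a = Paths.get("lucene/index", new String[0]); // explicit empty varargs array
    Path b = Paths.get("lucene/index");                // same call, array synthesized by javac
    System.out.println(a.equals(b)); // prints: true
  }
}
```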
@@ -39,7 +39,7 @@ public class SearchUtil {
       IndexReader[] readers = new IndexReader[files.length];
       for (int i = 0; i < files.length; i++) {
         readers[i] =
-            DirectoryReader.open(FSDirectory.open(Paths.get(files[i].getPath(), new String[0])));
+            DirectoryReader.open(FSDirectory.open(Paths.get(files[i].getPath())));
       }
       reader = new MultiReader(readers);
     } catch (IOException e) {
@@ -55,7 +55,7 @@ public class SearchUtil {
    * @throws IOException
    */
   public static DirectoryReader getIndexReader(String indexPath) throws IOException {
-    return DirectoryReader.open(FSDirectory.open(Paths.get(indexPath, new String[0])));
+    return DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
   }
   /**
    * 根据索引路径获取IndexSearcher