From b4e31775f8c06a09d090f9c32f64228cfa48113d Mon Sep 17 00:00:00 2001 From: suwen <577014284@qq.com> Date: Fri, 5 Feb 2021 09:08:25 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20lucene=E7=94=A8=E6=88=B7=E8=AF=8D?= =?UTF-8?q?=E5=85=B8=E6=96=87=E4=BB=B6=E5=88=9B=E5=BB=BA=E5=A4=B1=E8=B4=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 优化代码注释排版 --- .../forest/lucene/api/UserDicController.java | 1 - .../forest/lucene/cfg/DefaultConfig.java | 19 +- .../forest/lucene/core/AnalyzeContext.java | 166 ++++++-------- .../forest/lucene/core/CJKSegmenter.java | 79 +++---- .../lucene/core/CN_QuantifierSegmenter.java | 121 +++++----- .../forest/lucene/core/CharacterUtil.java | 42 ++-- .../forest/lucene/core/IKArbitrator.java | 54 ++--- .../rymcu/forest/lucene/core/IKSegmenter.java | 67 +++--- .../forest/lucene/core/LetterSegmenter.java | 177 +++++++-------- .../rymcu/forest/lucene/dic/Dictionary.java | 66 ++---- .../lucene/lucene/ArticleBeanIndex.java | 2 +- .../rymcu/forest/lucene/lucene/BaseIndex.java | 213 +++++++++--------- .../service/impl/UserDicServiceImpl.java | 13 +- .../rymcu/forest/lucene/util/IndexUtil.java | 2 +- .../rymcu/forest/lucene/util/SearchUtil.java | 4 +- 15 files changed, 447 insertions(+), 579 deletions(-) diff --git a/src/main/java/com/rymcu/forest/lucene/api/UserDicController.java b/src/main/java/com/rymcu/forest/lucene/api/UserDicController.java index 501fcae..0a085ec 100755 --- a/src/main/java/com/rymcu/forest/lucene/api/UserDicController.java +++ b/src/main/java/com/rymcu/forest/lucene/api/UserDicController.java @@ -7,7 +7,6 @@ import com.rymcu.forest.core.result.GlobalResultGenerator; import com.rymcu.forest.lucene.model.UserDic; import com.rymcu.forest.lucene.service.UserDicService; import com.rymcu.forest.util.Utils; -import lombok.extern.log4j.Log4j2; import org.springframework.web.bind.annotation.*; import javax.annotation.Resource; diff --git a/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java 
b/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java index f884e57..d8dd9be 100755 --- a/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java +++ b/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java @@ -30,24 +30,16 @@ public class DefaultConfig implements Configuration { /** 分词器默认字典路径 */ private static final String PATH_DIC_MAIN = "lucene/main2012.dic"; - + /** 题词字典路径 */ private static final String PATH_DIC_QUANTIFIER = "lucene/quantifier.dic"; + /** 用户自定义字典路径 */ private static final String PATH_USER_DIC = System.getProperty("user.dir") + "/lucene/userDic/userDic.dic"; - - /** 分词器配置文件路径 */ - private static final String FILE_NAME = "IKAnalyzer.cfg.xml"; - // 配置属性——扩展字典 - private static final String EXT_DICT = "ext_dic"; - // 配置属性——扩展停止词典 - private static final String EXT_STOP = "ext_stopword"; - + /** 配置属性——扩展字典 */ private String extDic = "lucene/ext.dic;" + PATH_USER_DIC; - + /** 配置属性——扩展停止词典 */ private String extStopword = "lucene/stopword.dic"; - /* - * 是否使用smart方式分词 - */ + /** 是否使用smart方式分词 */ private boolean useSmart; /** @@ -138,5 +130,4 @@ public class DefaultConfig implements Configuration { } return extStopWordDictFiles; } - } diff --git a/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java b/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java index ae8284f..0450695 100755 --- a/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java +++ b/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java @@ -1,30 +1,24 @@ /** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * IK 中文分词 版本 5.0 IK Analyzer release 5.0 * - * http://www.apache.org/licenses/LICENSE-2.0 + *
Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + *
http://www.apache.org/licenses/LICENSE-2.0 + * + *
Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio - * + *
源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
*/
package com.rymcu.forest.lucene.core;
-
import com.rymcu.forest.lucene.cfg.Configuration;
import com.rymcu.forest.lucene.dic.Dictionary;
@@ -32,53 +26,42 @@ import java.io.IOException;
import java.io.Reader;
import java.util.*;
-/**
- *
- * 分词器上下文状态
- *
- */
+/** 分词器上下文状态 */
class AnalyzeContext {
- // 默认缓冲区大小
+ /** 默认缓冲区大小 */
private static final int BUFF_SIZE = 4096;
- // 缓冲区耗尽的临界值
+ /** 缓冲区耗尽的临界值 */
private static final int BUFF_EXHAUST_CRITICAL = 100;
-
- // 字符窜读取缓冲
+ /** 字符串读取缓冲 */
private char[] segmentBuff;
- // 字符类型数组
+ /** 字符类型数组 */
private int[] charTypes;
-
- // 记录Reader内已分析的字串总长度
- // 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
+ /** 记录Reader内已分析的字串总长度, 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 */
private int buffOffset;
- // 当前缓冲区位置指针
+ /** 当前缓冲区位置指针 */
private int cursor;
- // 最近一次读入的,可处理的字串长度
+ /** 最近一次读入的,可处理的字串长度 */
private int available;
-
- // 子分词器锁
- // 该集合非空,说明有子分词器在占用segmentBuff
- private Set 满足一下条件时, 1.available == BUFF_SIZE 表示buffer满载 2.buffIndex < available - 1 && buffIndex >
+ * available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 3.!context.isBufferLocked()表示没有segmenter在占用buffer
* 要中断当前循环(buffer要进行移位,并再读取数据的操作)
+ *
* @return
*/
boolean needRefillBuffer() {
- return this.available == BUFF_SIZE && this.cursor < this.available - 1
- && this.cursor > this.available - BUFF_EXHAUST_CRITICAL && !this.isBufferLocked();
+ return this.available == BUFF_SIZE
+ && this.cursor < this.available - 1
+ && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
+ && !this.isBufferLocked();
}
- /**
- * 累计当前的segmentBuff相对于reader起始位置的位移
- */
+ /** 累计当前的segmentBuff相对于reader起始位置的位移 */
void markBufferOffset() {
this.buffOffset += this.cursor;
}
/**
* 向分词结果集添加词元
+ *
* @param lexeme
*/
void addLexeme(Lexeme lexeme) {
@@ -220,8 +199,8 @@ class AnalyzeContext {
}
/**
- * 添加分词结果路径
- * 路径起始位置 ---> 路径 映射表
+ * 添加分词结果路径 路径起始位置 ---> 路径 映射表
+ *
* @param path
*/
void addLexemePath(LexemePath path) {
@@ -232,6 +211,7 @@ class AnalyzeContext {
/**
* 返回原始分词结果
+ *
* @return
*/
QuickSortSet getOrgLexemes() {
@@ -239,14 +219,12 @@ class AnalyzeContext {
}
/**
- * 推送分词结果到结果集合
- * 1.从buff头部遍历到this.cursor已处理位置
- * 2.将map中存在的分词结果推入results
+ * 推送分词结果到结果集合 1.从buff头部遍历到this.cursor已处理位置 2.将map中存在的分词结果推入results
* 3.将map中不存在的CJDK字符以单字方式推入results
*/
void outputToResult() {
int index = 0;
- for (; index <= this.cursor;) {
+ while (index <= this.cursor) {
// 跳过非CJK字符
if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
index++;
@@ -269,7 +247,7 @@ class AnalyzeContext {
}
}
}
- } else {// pathMap中找不到index对应的LexemePath
+ } else { // pathMap中找不到index对应的LexemePath
// 单字输出
this.outputSingleCJK(index);
index++;
@@ -281,6 +259,7 @@ class AnalyzeContext {
/**
* 对CJK字符进行单字输出
+ *
* @param index
*/
private void outputSingleCJK(int index) {
@@ -294,9 +273,10 @@ class AnalyzeContext {
}
/**
- * 返回lexeme
- *
- * 同时处理合并
+ * 返回lexeme
+ *
+ * 同时处理合并
+ *
* @return
*/
Lexeme getNextLexeme() {
@@ -305,8 +285,8 @@ class AnalyzeContext {
while (result != null) {
// 数量词合并
this.compound(result);
- if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(),
- result.getLength())) {
+ if (Dictionary.getSingleton()
+ .isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
// 是停止词继续取列表的下一个
result = this.results.pollFirst();
} else {
@@ -318,9 +298,7 @@ class AnalyzeContext {
return result;
}
- /**
- * 重置分词上下文状态
- */
+ /** 重置分词上下文状态 */
void reset() {
this.buffLocker.clear();
this.orgLexemes = new QuickSortSet();
@@ -333,9 +311,7 @@ class AnalyzeContext {
this.pathMap.clear();
}
- /**
- * 组合词元
- */
+ /** 组合词元 */
private void compound(Lexeme result) {
if (!this.cfg.useSmart()) {
return;
@@ -372,8 +348,6 @@ class AnalyzeContext {
this.results.pollFirst();
}
}
-
}
}
-
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java
index 774f8a0..948beca 100755
--- a/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java
@@ -1,55 +1,42 @@
/**
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
*/
package com.rymcu.forest.lucene.core;
-
-
import com.rymcu.forest.lucene.dic.Dictionary;
import com.rymcu.forest.lucene.dic.Hit;
import java.util.LinkedList;
import java.util.List;
-/**
- * 中文-日韩文子分词器
- */
+/** 中文-日韩文子分词器 */
class CJKSegmenter implements ISegmenter {
- // 子分词器标签
+ /** 子分词器标签 */
static final String SEGMENTER_NAME = "CJK_SEGMENTER";
- // 待处理的分词hit队列
+ /** 待处理的分词hit队列 */
private List Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
*/
package com.rymcu.forest.lucene.core;
-
import com.rymcu.forest.lucene.dic.Dictionary;
import com.rymcu.forest.lucene.dic.Hit;
@@ -33,19 +27,17 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Set;
-
-/**
- *
- * 中文数量词子分词器
- */
+/** 中文数量词子分词器 */
class CN_QuantifierSegmenter implements ISegmenter {
- // 子分词器标签
+ /** 子分词器标签 */
static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
- // 中文数词
- private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";// Cnum
- private static Set Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- * 字符集识别工具类
+ * 源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
+ *
+ * 字符集识别工具类
*/
package com.rymcu.forest.lucene.core;
-/**
- *
- * 字符集识别工具类
- */
+/** 字符集识别工具类 */
class CharacterUtil {
public static final int CHAR_USELESS = 0;
@@ -43,6 +36,7 @@ class CharacterUtil {
/**
* 识别字符类型
+ *
* @param input
* @return int CharacterUtil定义的字符类型常量
*/
@@ -72,7 +66,6 @@ class CharacterUtil {
|| ub == Character.UnicodeBlock.KATAKANA // 片假名
|| ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
return CHAR_OTHER_CJK;
-
}
}
// 其他的不做处理的字符
@@ -81,6 +74,7 @@ class CharacterUtil {
/**
* 进行字符规格化(全角转半角,大写转小写处理)
+ *
* @param input
* @return char
*/
diff --git a/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java b/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java
index be8cc36..401242e 100755
--- a/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java
@@ -1,44 +1,36 @@
/**
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * IK 中文分词 版本 5.0 IK Analyzer release 5.0
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
*/
package com.rymcu.forest.lucene.core;
import java.util.Stack;
import java.util.TreeSet;
-/**
- * IK分词歧义裁决器
- */
+/** IK分词歧义裁决器 */
class IKArbitrator {
- IKArbitrator() {
-
- }
+ IKArbitrator() {}
/**
* 分词歧义处理
- * @param orgLexemes
+ *
+ * @param context
* @param useSmart
*/
void process(AnalyzeContext context, boolean useSmart) {
@@ -84,9 +76,10 @@ class IKArbitrator {
/**
* 歧义识别
+ *
* @param lexemeCell 歧义路径链表头
* @param fullTextLength 歧义路径文本长度
- * @param option 候选结果路径
+ * @param fullTextLength 候选结果路径
* @return
*/
private LexemePath judge(QuickSortSet.Cell lexemeCell, int fullTextLength) {
@@ -114,12 +107,12 @@ class IKArbitrator {
// 返回集合中的最优方案
return pathOptions.first();
-
}
/**
* 向前遍历,添加词元,构造一个无歧义词元组合
- * @param LexemePath path
+ *
+ * @param option path
* @return
*/
private Stack Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
+ * 源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
*/
package com.rymcu.forest.lucene.core;
@@ -32,30 +28,26 @@ import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
-/**
- * IK分词器主类
- *
- */
+/** IK分词器主类 */
public final class IKSegmenter {
- // 字符窜reader
+ /** 字符串reader */
private Reader input;
- // 分词器配置项
+ /** 分词器配置项 */
private Configuration cfg;
- // 分词器上下文
+ /** 分词器上下文 */
private AnalyzeContext context;
- // 分词处理器列表
+ /** 分词处理器列表 */
private List 非智能分词:细粒度输出所有可能的切分结果 智能分词: 合并数词和量词,对分词结果进行歧义判断
*/
public IKSegmenter(Reader input, boolean useSmart) {
this.input = input;
@@ -66,9 +58,9 @@ public final class IKSegmenter {
/**
* IK分词器构造函数
+ *
* @param input
* @param cfg 使用自定义的Configuration构造分词器
- *
*/
public IKSegmenter(Reader input, Configuration cfg) {
this.input = input;
@@ -76,9 +68,7 @@ public final class IKSegmenter {
this.init();
}
- /**
- * 初始化
- */
+ /** 初始化 */
private void init() {
// 初始化词典单例
Dictionary.initial(this.cfg);
@@ -92,6 +82,7 @@ public final class IKSegmenter {
/**
* 初始化词典,加载子分词器实现
+ *
* @return List Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012
+ * by Oolong studio
*/
package com.rymcu.forest.lucene.core;
import java.util.Arrays;
-/**
- *
- * 英文字符及阿拉伯数字子分词器
- */
+/** 英文字符及阿拉伯数字子分词器 */
class LetterSegmenter implements ISegmenter {
- // 子分词器标签
+ /** 子分词器标签 */
static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
- // 链接符号
- private static final char[] Letter_Connector = new char[] { '#', '&', '+', '-', '.', '@', '_' };
-
- // 数字符号
- private static final char[] Num_Connector = new char[] { ',', '.' };
-
- /*
- * 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符
- */
+ /** 链接符号 */
+ private static final char[] Letter_Connector = new char[] {'#', '&', '+', '-', '.', '@', '_'};
+ /** 数字符号 */
+ private static final char[] Num_Connector = new char[] {',', '.'};
+ /** 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 */
private int start;
- /*
- * 记录词元结束位置 end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
- */
+ /** 记录词元结束位置 end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 */
private int end;
-
- /*
- * 字母起始位置
- */
+ /** 字母起始位置 */
private int englishStart;
-
- /*
- * 字母结束位置
- */
+ /** 字母结束位置 */
private int englishEnd;
-
- /*
- * 阿拉伯数字起始位置
- */
+ /** 阿拉伯数字起始位置 */
private int arabicStart;
-
- /*
- * 阿拉伯数字结束位置
- */
+ /** 阿拉伯数字结束位置 */
private int arabicEnd;
LetterSegmenter() {
@@ -80,10 +54,6 @@ class LetterSegmenter implements ISegmenter {
this.arabicEnd = -1;
}
- /*
- * (non-Javadoc)
- * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
- */
@Override
public void analyze(AnalyzeContext context) {
boolean bufferLockFlag = false;
@@ -103,10 +73,6 @@ class LetterSegmenter implements ISegmenter {
}
}
- /*
- * (non-Javadoc)
- * @see org.wltea.analyzer.core.ISegmenter#reset()
- */
@Override
public void reset() {
this.start = -1;
@@ -118,16 +84,15 @@ class LetterSegmenter implements ISegmenter {
}
/**
- * 处理数字字母混合输出
- * 如:windos2000 | linliangyi2005@gmail.com
- * @param input
+ * 处理数字字母混合输出 如:windows2000 | linliangyi2005@gmail.com
+ *
* @param context
* @return
*/
private boolean processMixLetter(AnalyzeContext context) {
boolean needLock = false;
- if (this.start == -1) {// 当前的分词器尚未开始处理字符
+ if (this.start == -1) { // 当前的分词器尚未开始处理字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// 记录起始指针的位置,标明分词器进入处理状态
@@ -135,7 +100,7 @@ class LetterSegmenter implements ISegmenter {
this.end = start;
}
- } else {// 当前的分词器正在处理字符
+ } else { // 当前的分词器正在处理字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
|| CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// 记录下可能的结束位置
@@ -147,8 +112,12 @@ class LetterSegmenter implements ISegmenter {
this.end = context.getCursor();
} else {
// 遇到非Letter字符,输出词元
- Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start
- + 1, Lexeme.TYPE_LETTER);
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.start,
+ this.end - this.start + 1,
+ Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme);
this.start = -1;
this.end = -1;
@@ -159,8 +128,12 @@ class LetterSegmenter implements ISegmenter {
if (context.isBufferConsumed()) {
if (this.start != -1 && this.end != -1) {
// 缓冲以读完,输出词元
- Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start
- + 1, Lexeme.TYPE_LETTER);
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.start,
+ this.end - this.start + 1,
+ Lexeme.TYPE_LETTER);
context.addLexeme(newLexeme);
this.start = -1;
this.end = -1;
@@ -168,37 +141,38 @@ class LetterSegmenter implements ISegmenter {
}
// 判断是否锁定缓冲区
- if (this.start == -1 && this.end == -1) {
- // 对缓冲区解锁
- needLock = false;
- } else {
- needLock = true;
- }
+ // start 与 end 均为 -1 时解锁缓冲区,否则保持锁定
+ needLock = this.start != -1 || this.end != -1;
return needLock;
}
/**
* 处理纯英文字母输出
+ *
* @param context
* @return
*/
private boolean processEnglishLetter(AnalyzeContext context) {
boolean needLock = false;
- if (this.englishStart == -1) {// 当前的分词器尚未开始处理英文字符
+ if (this.englishStart == -1) { // 当前的分词器尚未开始处理英文字符
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// 记录起始指针的位置,标明分词器进入处理状态
this.englishStart = context.getCursor();
this.englishEnd = this.englishStart;
}
- } else {// 当前的分词器正在处理英文字符
+ } else { // 当前的分词器正在处理英文字符
if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
// 记录当前指针位置为结束位置
this.englishEnd = context.getCursor();
} else {
// 遇到非English字符,输出词元
- Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd
- - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.englishStart,
+ this.englishEnd - this.englishStart + 1,
+ Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme);
this.englishStart = -1;
this.englishEnd = -1;
@@ -209,8 +183,12 @@ class LetterSegmenter implements ISegmenter {
if (context.isBufferConsumed()) {
if (this.englishStart != -1 && this.englishEnd != -1) {
// 缓冲以读完,输出词元
- Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd
- - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.englishStart,
+ this.englishEnd - this.englishStart + 1,
+ Lexeme.TYPE_ENGLISH);
context.addLexeme(newLexeme);
this.englishStart = -1;
this.englishEnd = -1;
@@ -218,30 +196,27 @@ class LetterSegmenter implements ISegmenter {
}
// 判断是否锁定缓冲区
- if (this.englishStart == -1 && this.englishEnd == -1) {
- // 对缓冲区解锁
- needLock = false;
- } else {
- needLock = true;
- }
+ // englishStart 与 englishEnd 均为 -1 时解锁缓冲区,否则保持锁定
+ needLock = this.englishStart != -1 || this.englishEnd != -1;
return needLock;
}
/**
* 处理阿拉伯数字输出
+ *
* @param context
* @return
*/
private boolean processArabicLetter(AnalyzeContext context) {
boolean needLock = false;
- if (this.arabicStart == -1) {// 当前的分词器尚未开始处理数字字符
+ if (this.arabicStart == -1) { // 当前的分词器尚未开始处理数字字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
// 记录起始指针的位置,标明分词器进入处理状态
this.arabicStart = context.getCursor();
this.arabicEnd = this.arabicStart;
}
- } else {// 当前的分词器正在处理数字字符
+ } else { // 当前的分词器正在处理数字字符
if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
// 记录当前指针位置为结束位置
this.arabicEnd = context.getCursor();
@@ -250,8 +225,12 @@ class LetterSegmenter implements ISegmenter {
// 不输出数字,但不标记结束
} else {
// //遇到非Arabic字符,输出词元
- Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd
- - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.arabicStart,
+ this.arabicEnd - this.arabicStart + 1,
+ Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme);
this.arabicStart = -1;
this.arabicEnd = -1;
@@ -262,8 +241,12 @@ class LetterSegmenter implements ISegmenter {
if (context.isBufferConsumed()) {
if (this.arabicStart != -1 && this.arabicEnd != -1) {
// 生成已切分的词元
- Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd
- - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.arabicStart,
+ this.arabicEnd - this.arabicStart + 1,
+ Lexeme.TYPE_ARABIC);
context.addLexeme(newLexeme);
this.arabicStart = -1;
this.arabicEnd = -1;
@@ -271,17 +254,14 @@ class LetterSegmenter implements ISegmenter {
}
// 判断是否锁定缓冲区
- if (this.arabicStart == -1 && this.arabicEnd == -1) {
- // 对缓冲区解锁
- needLock = false;
- } else {
- needLock = true;
- }
+ // arabicStart 与 arabicEnd 均为 -1 时解锁缓冲区,否则保持锁定
+ needLock = this.arabicStart != -1 || this.arabicEnd != -1;
return needLock;
}
/**
* 判断是否是字母连接符号
+ *
* @param input
* @return
*/
@@ -292,6 +272,7 @@ class LetterSegmenter implements ISegmenter {
/**
* 判断是否是数字连接符号
+ *
* @param input
* @return
*/
diff --git a/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java b/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java
index 979daf8..a40ffa3 100755
--- a/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java
+++ b/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java
@@ -28,33 +28,22 @@ import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;
-/** 词典管理类,单子模式 */
+/** 词典管理类,单例模式 */
public class Dictionary {
- /*
- * 词典单子实例
- */
+ /** 词典单例 */
private static Dictionary singleton;
-
- /*
- * 主词典对象
- */
+ /** 主词典对象 */
private DictSegment _MainDict;
-
- /*
- * 停止词词典
- */
+ /** 停止词词典 */
private DictSegment _StopWordDict;
- /*
- * 量词词典
- */
+ /** 量词词典 */
private DictSegment _QuantifierDict;
-
+ /** 用户自定义词典路径 */
private static final String PATH_USER_DIC =
System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";
-
/** 配置对象 */
- private Configuration cfg;
+ private final Configuration cfg;
private Dictionary(Configuration cfg) {
this.cfg = cfg;
@@ -190,7 +179,7 @@ public class Dictionary {
InputStream is = resource.getInputStream();
BufferedReader br =
new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
- String theWord = null;
+ String theWord;
do {
theWord = br.readLine();
if (theWord != null && !"".equals(theWord.trim())) {
@@ -210,7 +199,7 @@ public class Dictionary {
// 加载扩展词典配置
List