diff --git a/src/main/java/com/rymcu/forest/lucene/api/UserDicController.java b/src/main/java/com/rymcu/forest/lucene/api/UserDicController.java index 501fcae..0a085ec 100755 --- a/src/main/java/com/rymcu/forest/lucene/api/UserDicController.java +++ b/src/main/java/com/rymcu/forest/lucene/api/UserDicController.java @@ -7,7 +7,6 @@ import com.rymcu.forest.core.result.GlobalResultGenerator; import com.rymcu.forest.lucene.model.UserDic; import com.rymcu.forest.lucene.service.UserDicService; import com.rymcu.forest.util.Utils; -import lombok.extern.log4j.Log4j2; import org.springframework.web.bind.annotation.*; import javax.annotation.Resource; diff --git a/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java b/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java index f884e57..d8dd9be 100755 --- a/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java +++ b/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java @@ -30,24 +30,16 @@ public class DefaultConfig implements Configuration { /** 分词器默认字典路径 */ private static final String PATH_DIC_MAIN = "lucene/main2012.dic"; - + /** 题词字典路径 */ private static final String PATH_DIC_QUANTIFIER = "lucene/quantifier.dic"; + /** 用户自定义字典路径 */ private static final String PATH_USER_DIC = System.getProperty("user.dir") + "/lucene/userDic/userDic.dic"; - - /** 分词器配置文件路径 */ - private static final String FILE_NAME = "IKAnalyzer.cfg.xml"; - // 配置属性——扩展字典 - private static final String EXT_DICT = "ext_dic"; - // 配置属性——扩展停止词典 - private static final String EXT_STOP = "ext_stopword"; - + /** 配置属性——扩展字典 */ private String extDic = "lucene/ext.dic;" + PATH_USER_DIC; - + /** 配置属性——扩展停止词典 */ private String extStopword = "lucene/stopword.dic"; - /* - * 是否使用smart方式分词 - */ + /** 是否使用smart方式分词 */ private boolean useSmart; /** @@ -138,5 +130,4 @@ public class DefaultConfig implements Configuration { } return extStopWordDictFiles; } - } diff --git a/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java 
b/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java index ae8284f..0450695 100755 --- a/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java +++ b/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java @@ -1,30 +1,24 @@ /** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * IK 中文分词 版本 5.0 IK Analyzer release 5.0 * - * http://www.apache.org/licenses/LICENSE-2.0 + *

Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio - * + *

源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012 + * by Oolong studio */ package com.rymcu.forest.lucene.core; - import com.rymcu.forest.lucene.cfg.Configuration; import com.rymcu.forest.lucene.dic.Dictionary; @@ -32,53 +26,42 @@ import java.io.IOException; import java.io.Reader; import java.util.*; -/** - * - * 分词器上下文状态 - * - */ +/** 分词器上下文状态 */ class AnalyzeContext { - // 默认缓冲区大小 + /** 默认缓冲区大小 */ private static final int BUFF_SIZE = 4096; - // 缓冲区耗尽的临界值 + /** 缓冲区耗尽的临界值 */ private static final int BUFF_EXHAUST_CRITICAL = 100; - - // 字符窜读取缓冲 + /** 字符窜读取缓冲 */ private char[] segmentBuff; - // 字符类型数组 + /** 字符类型数组 */ private int[] charTypes; - - // 记录Reader内已分析的字串总长度 - // 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 + /** 记录Reader内已分析的字串总长度, 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 */ private int buffOffset; - // 当前缓冲区位置指针 + /** 当前缓冲区位置指针 */ private int cursor; - // 最近一次读入的,可处理的字串长度 + /** 最近一次读入的,可处理的字串长度 */ private int available; - - // 子分词器锁 - // 该集合非空,说明有子分词器在占用segmentBuff - private Set buffLocker; - - // 原始分词结果集合,未经歧义处理 + /** 子分词器锁, 该集合非空,说明有子分词器在占用segmentBuff */ + private final Set buffLocker; + /** 原始分词结果集合,未经歧义处理 */ private QuickSortSet orgLexemes; - // LexemePath位置索引表 - private Map pathMap; - // 最终分词结果集 - private LinkedList results; - - // 分词器配置项 - private Configuration cfg; + /** LexemePath位置索引表 */ + private final Map pathMap; + /** 最终分词结果集 */ + private final LinkedList results; + /** 分词器配置项 */ + private final Configuration cfg; public AnalyzeContext(Configuration cfg) { this.cfg = cfg; this.segmentBuff = new char[BUFF_SIZE]; this.charTypes = new int[BUFF_SIZE]; - this.buffLocker = new HashSet(); + this.buffLocker = new HashSet<>(); this.orgLexemes = new QuickSortSet(); - this.pathMap = new HashMap(); - this.results = new LinkedList(); + this.pathMap = new HashMap<>(); + this.results = new LinkedList<>(); } int getCursor() { @@ -102,10 +85,11 @@ class AnalyzeContext { } /** - * 根据context的上下文情况,填充segmentBuff + 
* 根据context的上下文情况,填充segmentBuff + * * @param reader * @return 返回待分析的(有效的)字串长度 - * @throws IOException + * @throws IOException */ int fillBuffer(Reader reader) throws IOException { int readCount = 0; @@ -129,20 +113,14 @@ class AnalyzeContext { return readCount; } - /** - * 初始化buff指针,处理第一个字符 - */ + /** 初始化buff指针,处理第一个字符 */ void initCursor() { this.cursor = 0; this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); } - /** - * 指针+1 - * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false - * 并处理当前字符 - */ + /** 指针+1 成功返回 true; 指针已经到了buff尾部,不能前进,返回false 并处理当前字符 */ boolean moveCursor() { if (this.cursor < this.available - 1) { this.cursor++; @@ -155,8 +133,8 @@ class AnalyzeContext { } /** - * 设置当前segmentBuff为锁定状态 - * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff + * 设置当前segmentBuff为锁定状态 加入占用segmentBuff的子分词器名称,表示占用segmentBuff + * * @param segmenterName */ void lockBuffer(String segmenterName) { @@ -165,6 +143,7 @@ class AnalyzeContext { /** * 移除指定的子分词器名,释放对segmentBuff的占用 + * * @param segmenterName */ void unlockBuffer(String segmenterName) { @@ -172,8 +151,8 @@ class AnalyzeContext { } /** - * 只要buffLocker中存在segmenterName - * 则buffer被锁定 + * 只要buffLocker中存在segmenterName 则buffer被锁定 + * * @return boolean 缓冲去是否被锁定 */ boolean isBufferLocked() { @@ -181,8 +160,8 @@ class AnalyzeContext { } /** - * 判断当前segmentBuff是否已经用完 - * 当前执针cursor移至segmentBuff末端this.available - 1 + * 判断当前segmentBuff是否已经用完 当前执针cursor移至segmentBuff末端this.available - 1 + * * @return */ boolean isBufferConsumed() { @@ -191,28 +170,28 @@ class AnalyzeContext { /** * 判断segmentBuff是否需要读取新数据 - * - * 满足一下条件时, - * 1.available == BUFF_SIZE 表示buffer满载 - * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 - * 3.!context.isBufferLocked()表示没有segmenter在占用buffer + * + *

满足以下条件时, 1.available == BUFF_SIZE 表示buffer满载 2.buffIndex < available - 1 && buffIndex > + * available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 3.!context.isBufferLocked()表示没有segmenter在占用buffer * 要中断当前循环(buffer要进行移位,并再读取数据的操作) + * * @return */ boolean needRefillBuffer() { - return this.available == BUFF_SIZE && this.cursor < this.available - 1 - && this.cursor > this.available - BUFF_EXHAUST_CRITICAL && !this.isBufferLocked(); + return this.available == BUFF_SIZE + && this.cursor < this.available - 1 + && this.cursor > this.available - BUFF_EXHAUST_CRITICAL + && !this.isBufferLocked(); } - /** - * 累计当前的segmentBuff相对于reader起始位置的位移 - */ + /** 累计当前的segmentBuff相对于reader起始位置的位移 */ void markBufferOffset() { this.buffOffset += this.cursor; } /** * 向分词结果集添加词元 + * * @param lexeme */ void addLexeme(Lexeme lexeme) { @@ -220,8 +199,8 @@ class AnalyzeContext { } /** - * 添加分词结果路径 - * 路径起始位置 ---> 路径 映射表 + * 添加分词结果路径 路径起始位置 ---> 路径 映射表 + * * @param path */ void addLexemePath(LexemePath path) { @@ -232,6 +211,7 @@ class AnalyzeContext { /** * 返回原始分词结果 + * * @return */ QuickSortSet getOrgLexemes() { @@ -239,14 +219,12 @@ class AnalyzeContext { } /** - * 推送分词结果到结果集合 - * 1.从buff头部遍历到this.cursor已处理位置 - * 2.将map中存在的分词结果推入results + * 推送分词结果到结果集合 1.从buff头部遍历到this.cursor已处理位置 2.将map中存在的分词结果推入results * 3.将map中不存在的CJDK字符以单字方式推入results */ void outputToResult() { int index = 0; - for (; index <= this.cursor;) { + while (index <= this.cursor) { // 跳过非CJK字符 if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) { index++; @@ -269,7 +247,7 @@ class AnalyzeContext { } } } - } else {// pathMap中找不到index对应的LexemePath + } else { // pathMap中找不到index对应的LexemePath // 单字输出 this.outputSingleCJK(index); index++; @@ -281,6 +259,7 @@ class AnalyzeContext { /** * 对CJK字符进行单字输出 + * * @param index */ private void outputSingleCJK(int index) { @@ -294,9 +273,10 @@ class AnalyzeContext { } /** - * 返回lexeme - * - * 同时处理合并 + * 返回lexeme + * + *

同时处理合并 + * * @return */ Lexeme getNextLexeme() { @@ -305,8 +285,8 @@ class AnalyzeContext { while (result != null) { // 数量词合并 this.compound(result); - if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), - result.getLength())) { + if (Dictionary.getSingleton() + .isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) { // 是停止词继续取列表的下一个 result = this.results.pollFirst(); } else { @@ -318,9 +298,7 @@ class AnalyzeContext { return result; } - /** - * 重置分词上下文状态 - */ + /** 重置分词上下文状态 */ void reset() { this.buffLocker.clear(); this.orgLexemes = new QuickSortSet(); @@ -333,9 +311,7 @@ class AnalyzeContext { this.pathMap.clear(); } - /** - * 组合词元 - */ + /** 组合词元 */ private void compound(Lexeme result) { if (!this.cfg.useSmart()) { return; @@ -372,8 +348,6 @@ class AnalyzeContext { this.results.pollFirst(); } } - } } - } diff --git a/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java index 774f8a0..948beca 100755 --- a/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java +++ b/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java @@ -1,55 +1,42 @@ /** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * IK 中文分词 版本 5.0 IK Analyzer release 5.0 * - * http://www.apache.org/licenses/LICENSE-2.0 + *

Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio - * + *

源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012 + * by Oolong studio */ package com.rymcu.forest.lucene.core; - - import com.rymcu.forest.lucene.dic.Dictionary; import com.rymcu.forest.lucene.dic.Hit; import java.util.LinkedList; import java.util.List; -/** - * 中文-日韩文子分词器 - */ +/** 中文-日韩文子分词器 */ class CJKSegmenter implements ISegmenter { - // 子分词器标签 + /** 子分词器标签 */ static final String SEGMENTER_NAME = "CJK_SEGMENTER"; - // 待处理的分词hit队列 + /** 待处理的分词hit队列 */ private List tmpHits; CJKSegmenter() { this.tmpHits = new LinkedList(); } - /* - * (non-Javadoc) - * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) - */ @Override public void analyze(AnalyzeContext context) { if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) { @@ -59,15 +46,20 @@ class CJKSegmenter implements ISegmenter { // 处理词段队列 Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); for (Hit hit : tmpArray) { - hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), - context.getCursor(), hit); + hit = + Dictionary.getSingleton() + .matchWithHit(context.getSegmentBuff(), context.getCursor(), hit); if (hit.isMatch()) { // 输出当前的词 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), - context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_CNWORD); + Lexeme newLexeme = + new Lexeme( + context.getBufferOffset(), + hit.getBegin(), + context.getCursor() - hit.getBegin() + 1, + Lexeme.TYPE_CNWORD); context.addLexeme(newLexeme); - if (!hit.isPrefix()) {// 不是词前缀,hit不需要继续匹配,移除 + if (!hit.isPrefix()) { // 不是词前缀,hit不需要继续匹配,移除 this.tmpHits.remove(hit); } @@ -78,14 +70,14 @@ class CJKSegmenter implements ISegmenter { } } - // ********************************* // 再对当前指针位置的字符进行单字匹配 - Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), - context.getCursor(), 1); - if (singleCharHit.isMatch()) {// 首字成词 + Hit singleCharHit = + 
Dictionary.getSingleton() + .matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); + if (singleCharHit.isMatch()) { // 首字成词 // 输出当前的词 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, - Lexeme.TYPE_CNWORD); + Lexeme newLexeme = + new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD); context.addLexeme(newLexeme); // 同时也是词前缀 @@ -93,7 +85,7 @@ class CJKSegmenter implements ISegmenter { // 前缀匹配则放入hit列表 this.tmpHits.add(singleCharHit); } - } else if (singleCharHit.isPrefix()) {// 首字为词前缀 + } else if (singleCharHit.isPrefix()) { // 首字为词前缀 // 前缀匹配则放入hit列表 this.tmpHits.add(singleCharHit); } @@ -119,14 +111,9 @@ class CJKSegmenter implements ISegmenter { } } - /* - * (non-Javadoc) - * @see org.wltea.analyzer.core.ISegmenter#reset() - */ @Override public void reset() { // 清空队列 this.tmpHits.clear(); } - } diff --git a/src/main/java/com/rymcu/forest/lucene/core/CN_QuantifierSegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/CN_QuantifierSegmenter.java index 1a73b31..cc0227d 100755 --- a/src/main/java/com/rymcu/forest/lucene/core/CN_QuantifierSegmenter.java +++ b/src/main/java/com/rymcu/forest/lucene/core/CN_QuantifierSegmenter.java @@ -1,30 +1,24 @@ /** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * IK 中文分词 版本 5.0 IK Analyzer release 5.0 * - * http://www.apache.org/licenses/LICENSE-2.0 + *

Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio - * + *

源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012 + * by Oolong studio */ package com.rymcu.forest.lucene.core; - import com.rymcu.forest.lucene.dic.Dictionary; import com.rymcu.forest.lucene.dic.Hit; @@ -33,19 +27,17 @@ import java.util.LinkedList; import java.util.List; import java.util.Set; - -/** - * - * 中文数量词子分词器 - */ +/** 中文数量词子分词器 */ class CN_QuantifierSegmenter implements ISegmenter { - // 子分词器标签 + /** 子分词器标签 */ static final String SEGMENTER_NAME = "QUAN_SEGMENTER"; - // 中文数词 - private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";// Cnum - private static Set ChnNumberChars = new HashSet(); + /** 中文数词 */ + private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿"; + + private static Set ChnNumberChars = new HashSet<>(); + static { char[] ca = Chn_Num.toCharArray(); for (char nChar : ca) { @@ -53,17 +45,13 @@ class CN_QuantifierSegmenter implements ISegmenter { } } - /* - * 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 - */ + /** 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 */ private int nStart; - /* - * 记录词元结束位置 end记录的是在词元中最后一个出现的合理的数词结束 - */ + /** 记录词元结束位置 end记录的是在词元中最后一个出现的合理的数词结束 */ private int nEnd; - // 待处理的量词hit队列 - private List countHits; + /** 待处理的量词hit队列 */ + private final List countHits; CN_QuantifierSegmenter() { nStart = -1; @@ -71,16 +59,13 @@ class CN_QuantifierSegmenter implements ISegmenter { this.countHits = new LinkedList(); } - /** - * 分词 - */ + /** 分词 */ @Override public void analyze(AnalyzeContext context) { // 处理中文数词 this.processCNumber(context); // 处理中文量词 this.processCount(context); - // 判断是否锁定缓冲区 if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) { // 对缓冲区解锁 @@ -90,9 +75,7 @@ class CN_QuantifierSegmenter implements ISegmenter { } } - /** - * 重置子分词器状态 - */ + /** 重置子分词器状态 */ @Override public void reset() { nStart = -1; @@ -100,18 +83,16 @@ class CN_QuantifierSegmenter implements ISegmenter { countHits.clear(); } - /** - * 处理数词 - */ + /** 
处理数词 */ private void processCNumber(AnalyzeContext context) { - if (nStart == -1 && nEnd == -1) {// 初始状态 + if (nStart == -1 && nEnd == -1) { // 初始状态 if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() && ChnNumberChars.contains(context.getCurrentChar())) { // 记录数词的起始、结束位置 nStart = context.getCursor(); nEnd = context.getCursor(); } - } else {// 正在处理状态 + } else { // 正在处理状态 if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() && ChnNumberChars.contains(context.getCurrentChar())) { // 记录数词的结束位置 @@ -139,6 +120,7 @@ class CN_QuantifierSegmenter implements ISegmenter { /** * 处理中文量词 + * * @param context */ private void processCount(AnalyzeContext context) { @@ -146,23 +128,26 @@ class CN_QuantifierSegmenter implements ISegmenter { if (!this.needCountScan(context)) { return; } - if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) { - // 优先处理countHits中的hit if (!this.countHits.isEmpty()) { // 处理词段队列 Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); for (Hit hit : tmpArray) { - hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), - context.getCursor(), hit); + hit = + Dictionary.getSingleton() + .matchWithHit(context.getSegmentBuff(), context.getCursor(), hit); if (hit.isMatch()) { // 输出当前的词 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), - context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT); + Lexeme newLexeme = + new Lexeme( + context.getBufferOffset(), + hit.getBegin(), + context.getCursor() - hit.getBegin() + 1, + Lexeme.TYPE_COUNT); context.addLexeme(newLexeme); - if (!hit.isPrefix()) {// 不是词前缀,hit不需要继续匹配,移除 + if (!hit.isPrefix()) { // 不是词前缀,hit不需要继续匹配,移除 this.countHits.remove(hit); } @@ -172,33 +157,29 @@ class CN_QuantifierSegmenter implements ISegmenter { } } } - - // ********************************* // 对当前指针位置的字符进行单字匹配 - Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), - context.getCursor(), 1); - if 
(singleCharHit.isMatch()) {// 首字成量词词 + Hit singleCharHit = + Dictionary.getSingleton() + .matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); + if (singleCharHit.isMatch()) { // 首字成量词词 // 输出当前的词 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, - Lexeme.TYPE_COUNT); + Lexeme newLexeme = + new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT); context.addLexeme(newLexeme); - // 同时也是词前缀 if (singleCharHit.isPrefix()) { // 前缀匹配则放入hit列表 this.countHits.add(singleCharHit); } - } else if (singleCharHit.isPrefix()) {// 首字为量词前缀 + } else if (singleCharHit.isPrefix()) { // 首字为量词前缀 // 前缀匹配则放入hit列表 this.countHits.add(singleCharHit); } - } else { // 输入的不是中文字符 // 清空未成形的量词 this.countHits.clear(); } - // 缓冲区数据已经读完,还有尚未输出的量词 if (context.isBufferConsumed()) { // 清空未成形的量词 @@ -208,6 +189,7 @@ class CN_QuantifierSegmenter implements ISegmenter { /** * 判断是否需要扫描量词 + * * @return */ private boolean needCountScan(AnalyzeContext context) { @@ -230,16 +212,15 @@ class CN_QuantifierSegmenter implements ISegmenter { /** * 添加数词词元到结果集 + * * @param context */ private void outputNumLexeme(AnalyzeContext context) { if (nStart > -1 && nEnd > -1) { // 输出数词 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, - Lexeme.TYPE_CNUM); + Lexeme newLexeme = + new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM); context.addLexeme(newLexeme); - } } - } diff --git a/src/main/java/com/rymcu/forest/lucene/core/CharacterUtil.java b/src/main/java/com/rymcu/forest/lucene/core/CharacterUtil.java index 4a57832..cf367ee 100755 --- a/src/main/java/com/rymcu/forest/lucene/core/CharacterUtil.java +++ b/src/main/java/com/rymcu/forest/lucene/core/CharacterUtil.java @@ -1,34 +1,27 @@ /** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * IK 中文分词 版本 5.0 IK Analyzer release 5.0 * - * http://www.apache.org/licenses/LICENSE-2.0 + *

Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio - * - * 字符集识别工具类 + *

源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012 + * by Oolong studio + * + *

字符集识别工具类 */ package com.rymcu.forest.lucene.core; -/** - * - * 字符集识别工具类 - */ +/** 字符集识别工具类 */ class CharacterUtil { public static final int CHAR_USELESS = 0; @@ -43,6 +36,7 @@ class CharacterUtil { /** * 识别字符类型 + * * @param input * @return int CharacterUtil定义的字符类型常量 */ @@ -72,7 +66,6 @@ class CharacterUtil { || ub == Character.UnicodeBlock.KATAKANA // 片假名 || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) { return CHAR_OTHER_CJK; - } } // 其他的不做处理的字符 @@ -81,6 +74,7 @@ class CharacterUtil { /** * 进行字符规格化(全角转半角,大写转小写处理) + * * @param input * @return char */ diff --git a/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java b/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java index be8cc36..401242e 100755 --- a/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java +++ b/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java @@ -1,44 +1,36 @@ /** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * IK 中文分词 版本 5.0 IK Analyzer release 5.0 * - * http://www.apache.org/licenses/LICENSE-2.0 + *

Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio - * + *

源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012 + * by Oolong studio */ package com.rymcu.forest.lucene.core; import java.util.Stack; import java.util.TreeSet; -/** - * IK分词歧义裁决器 - */ +/** IK分词歧义裁决器 */ class IKArbitrator { - IKArbitrator() { - - } + IKArbitrator() {} /** * 分词歧义处理 - * @param orgLexemes + * + * @param context * @param useSmart */ void process(AnalyzeContext context, boolean useSmart) { @@ -84,9 +76,10 @@ class IKArbitrator { /** * 歧义识别 + * * @param lexemeCell 歧义路径链表头 * @param fullTextLength 歧义路径文本长度 - * @param option 候选结果路径 + * @param fullTextLength 候选结果路径 * @return */ private LexemePath judge(QuickSortSet.Cell lexemeCell, int fullTextLength) { @@ -114,12 +107,12 @@ class IKArbitrator { // 返回集合中的最优方案 return pathOptions.first(); - } /** * 向前遍历,添加词元,构造一个无歧义词元组合 - * @param LexemePath path + * + * @param option path * @return */ private Stack forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) { @@ -139,14 +132,13 @@ class IKArbitrator { /** * 回滚词元链,直到它能够接受指定的词元 - * @param lexeme + * + * @param option * @param l */ private void backPath(Lexeme l, LexemePath option) { while (option.checkCross(l)) { option.removeTail(); } - } - } diff --git a/src/main/java/com/rymcu/forest/lucene/core/IKSegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/IKSegmenter.java index 6841cdd..9054c91 100755 --- a/src/main/java/com/rymcu/forest/lucene/core/IKSegmenter.java +++ b/src/main/java/com/rymcu/forest/lucene/core/IKSegmenter.java @@ -1,25 +1,21 @@ /** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at + * IK 中文分词 版本 5.0 IK Analyzer release 5.0 * - * http://www.apache.org/licenses/LICENSE-2.0 + *

Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio + *

源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012 + * by Oolong studio */ package com.rymcu.forest.lucene.core; @@ -32,30 +28,26 @@ import java.io.Reader; import java.util.ArrayList; import java.util.List; -/** - * IK分词器主类 - * - */ +/** IK分词器主类 */ public final class IKSegmenter { - // 字符窜reader + /** 字符串reader */ private Reader input; - // 分词器配置项 + /** 分词器配置项 */ private Configuration cfg; - // 分词器上下文 + /** 分词器上下文 */ private AnalyzeContext context; - // 分词处理器列表 + /** 分词处理器列表 */ private List segmenters; - // 分词歧义裁决器 + /** 分词歧义裁决器 */ private IKArbitrator arbitrator; /** * IK分词器构造函数 - * @param input + * + * @param input * @param useSmart 为true,使用智能分词策略 - * - * 非智能分词:细粒度输出所有可能的切分结果 - * 智能分词: 合并数词和量词,对分词结果进行歧义判断 + *

非智能分词:细粒度输出所有可能的切分结果 智能分词: 合并数词和量词,对分词结果进行歧义判断 */ public IKSegmenter(Reader input, boolean useSmart) { this.input = input; @@ -66,9 +58,9 @@ public final class IKSegmenter { /** * IK分词器构造函数 + * * @param input * @param cfg 使用自定义的Configuration构造分词器 - * */ public IKSegmenter(Reader input, Configuration cfg) { this.input = input; @@ -76,9 +68,7 @@ public final class IKSegmenter { this.init(); } - /** - * 初始化 - */ + /** 初始化 */ private void init() { // 初始化词典单例 Dictionary.initial(this.cfg); @@ -92,6 +82,7 @@ public final class IKSegmenter { /** * 初始化词典,加载子分词器实现 + * * @return List */ private List loadSegmenters() { @@ -107,6 +98,7 @@ public final class IKSegmenter { /** * 分词,获取下一个词元 + * * @return Lexeme 词元对象 * @throws IOException */ @@ -152,9 +144,10 @@ public final class IKSegmenter { } /** - * 重置分词器到初始状态 - * @param input - */ + * 重置分词器到初始状态 + * + * @param input + */ public synchronized void reset(Reader input) { this.input = input; context.reset(); diff --git a/src/main/java/com/rymcu/forest/lucene/core/LetterSegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/LetterSegmenter.java index 7c6bc4e..0a53aff 100755 --- a/src/main/java/com/rymcu/forest/lucene/core/LetterSegmenter.java +++ b/src/main/java/com/rymcu/forest/lucene/core/LetterSegmenter.java @@ -1,72 +1,46 @@ /** - * IK 中文分词 版本 5.0 - * IK Analyzer release 5.0 - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at + * IK 中文分词 版本 5.0 IK Analyzer release 5.0 * - * http://www.apache.org/licenses/LICENSE-2.0 + *

Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and * limitations under the License. * - * 源代码由林良益(linliangyi2005@gmail.com)提供 - * 版权声明 2012,乌龙茶工作室 - * provided by Linliangyi and copyright 2012 by Oolong studio - * + *

源代码由林良益(linliangyi2005@gmail.com)提供 版权声明 2012,乌龙茶工作室 provided by Linliangyi and copyright 2012 + * by Oolong studio */ package com.rymcu.forest.lucene.core; import java.util.Arrays; -/** - * - * 英文字符及阿拉伯数字子分词器 - */ +/** 英文字符及阿拉伯数字子分词器 */ class LetterSegmenter implements ISegmenter { - // 子分词器标签 + /** 子分词器标签 */ static final String SEGMENTER_NAME = "LETTER_SEGMENTER"; - // 链接符号 - private static final char[] Letter_Connector = new char[] { '#', '&', '+', '-', '.', '@', '_' }; - - // 数字符号 - private static final char[] Num_Connector = new char[] { ',', '.' }; - - /* - * 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 - */ + /** 链接符号 */ + private static final char[] Letter_Connector = new char[] {'#', '&', '+', '-', '.', '@', '_'}; + /** 数字符号 */ + private static final char[] Num_Connector = new char[] {',', '.'}; + /** 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 */ private int start; - /* - * 记录词元结束位置 end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 - */ + /** 记录词元结束位置 end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 */ private int end; - - /* - * 字母起始位置 - */ + /** 字母起始位置 */ private int englishStart; - - /* - * 字母结束位置 - */ + /** 字母结束位置 */ private int englishEnd; - - /* - * 阿拉伯数字起始位置 - */ + /** 阿拉伯数字起始位置 */ private int arabicStart; - - /* - * 阿拉伯数字结束位置 - */ + /** 阿拉伯数字结束位置 */ private int arabicEnd; LetterSegmenter() { @@ -80,10 +54,6 @@ class LetterSegmenter implements ISegmenter { this.arabicEnd = -1; } - /* - * (non-Javadoc) - * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) - */ @Override public void analyze(AnalyzeContext context) { boolean bufferLockFlag = false; @@ -103,10 +73,6 @@ class LetterSegmenter implements ISegmenter { } } - /* - * (non-Javadoc) - * @see org.wltea.analyzer.core.ISegmenter#reset() - */ @Override public void reset() { this.start = -1; @@ -118,16 +84,15 @@ class LetterSegmenter implements ISegmenter { } /** - * 处理数字字母混合输出 - * 如:windos2000 | linliangyi2005@gmail.com - * @param input + * 处理数字字母混合输出 
如:windos2000 | linliangyi2005@gmail.com + * * @param context * @return */ private boolean processMixLetter(AnalyzeContext context) { boolean needLock = false; - if (this.start == -1) {// 当前的分词器尚未开始处理字符 + if (this.start == -1) { // 当前的分词器尚未开始处理字符 if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { // 记录起始指针的位置,标明分词器进入处理状态 @@ -135,7 +100,7 @@ class LetterSegmenter implements ISegmenter { this.end = start; } - } else {// 当前的分词器正在处理字符 + } else { // 当前的分词器正在处理字符 if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { // 记录下可能的结束位置 @@ -147,8 +112,12 @@ class LetterSegmenter implements ISegmenter { this.end = context.getCursor(); } else { // 遇到非Letter字符,输出词元 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start - + 1, Lexeme.TYPE_LETTER); + Lexeme newLexeme = + new Lexeme( + context.getBufferOffset(), + this.start, + this.end - this.start + 1, + Lexeme.TYPE_LETTER); context.addLexeme(newLexeme); this.start = -1; this.end = -1; @@ -159,8 +128,12 @@ class LetterSegmenter implements ISegmenter { if (context.isBufferConsumed()) { if (this.start != -1 && this.end != -1) { // 缓冲以读完,输出词元 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start - + 1, Lexeme.TYPE_LETTER); + Lexeme newLexeme = + new Lexeme( + context.getBufferOffset(), + this.start, + this.end - this.start + 1, + Lexeme.TYPE_LETTER); context.addLexeme(newLexeme); this.start = -1; this.end = -1; @@ -168,37 +141,38 @@ class LetterSegmenter implements ISegmenter { } // 判断是否锁定缓冲区 - if (this.start == -1 && this.end == -1) { - // 对缓冲区解锁 - needLock = false; - } else { - needLock = true; - } + // 对缓冲区解锁 + needLock = this.start != -1 || this.end != -1; return needLock; } /** * 处理纯英文字母输出 + * * @param context * @return */ private boolean processEnglishLetter(AnalyzeContext context) { boolean needLock = 
false; - if (this.englishStart == -1) {// 当前的分词器尚未开始处理英文字符 + if (this.englishStart == -1) { // 当前的分词器尚未开始处理英文字符 if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { // 记录起始指针的位置,标明分词器进入处理状态 this.englishStart = context.getCursor(); this.englishEnd = this.englishStart; } - } else {// 当前的分词器正在处理英文字符 + } else { // 当前的分词器正在处理英文字符 if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { // 记录当前指针位置为结束位置 this.englishEnd = context.getCursor(); } else { // 遇到非English字符,输出词元 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - - this.englishStart + 1, Lexeme.TYPE_ENGLISH); + Lexeme newLexeme = + new Lexeme( + context.getBufferOffset(), + this.englishStart, + this.englishEnd - this.englishStart + 1, + Lexeme.TYPE_ENGLISH); context.addLexeme(newLexeme); this.englishStart = -1; this.englishEnd = -1; @@ -209,8 +183,12 @@ class LetterSegmenter implements ISegmenter { if (context.isBufferConsumed()) { if (this.englishStart != -1 && this.englishEnd != -1) { // 缓冲以读完,输出词元 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - - this.englishStart + 1, Lexeme.TYPE_ENGLISH); + Lexeme newLexeme = + new Lexeme( + context.getBufferOffset(), + this.englishStart, + this.englishEnd - this.englishStart + 1, + Lexeme.TYPE_ENGLISH); context.addLexeme(newLexeme); this.englishStart = -1; this.englishEnd = -1; @@ -218,30 +196,27 @@ class LetterSegmenter implements ISegmenter { } // 判断是否锁定缓冲区 - if (this.englishStart == -1 && this.englishEnd == -1) { - // 对缓冲区解锁 - needLock = false; - } else { - needLock = true; - } + // 对缓冲区解锁 + needLock = this.englishStart != -1 || this.englishEnd != -1; return needLock; } /** * 处理阿拉伯数字输出 + * * @param context * @return */ private boolean processArabicLetter(AnalyzeContext context) { boolean needLock = false; - if (this.arabicStart == -1) {// 当前的分词器尚未开始处理数字字符 + if (this.arabicStart == -1) { // 当前的分词器尚未开始处理数字字符 if (CharacterUtil.CHAR_ARABIC == 
context.getCurrentCharType()) { // 记录起始指针的位置,标明分词器进入处理状态 this.arabicStart = context.getCursor(); this.arabicEnd = this.arabicStart; } - } else {// 当前的分词器正在处理数字字符 + } else { // 当前的分词器正在处理数字字符 if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) { // 记录当前指针位置为结束位置 this.arabicEnd = context.getCursor(); @@ -250,8 +225,12 @@ class LetterSegmenter implements ISegmenter { // 不输出数字,但不标记结束 } else { // //遇到非Arabic字符,输出词元 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - - this.arabicStart + 1, Lexeme.TYPE_ARABIC); + Lexeme newLexeme = + new Lexeme( + context.getBufferOffset(), + this.arabicStart, + this.arabicEnd - this.arabicStart + 1, + Lexeme.TYPE_ARABIC); context.addLexeme(newLexeme); this.arabicStart = -1; this.arabicEnd = -1; @@ -262,8 +241,12 @@ class LetterSegmenter implements ISegmenter { if (context.isBufferConsumed()) { if (this.arabicStart != -1 && this.arabicEnd != -1) { // 生成已切分的词元 - Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - - this.arabicStart + 1, Lexeme.TYPE_ARABIC); + Lexeme newLexeme = + new Lexeme( + context.getBufferOffset(), + this.arabicStart, + this.arabicEnd - this.arabicStart + 1, + Lexeme.TYPE_ARABIC); context.addLexeme(newLexeme); this.arabicStart = -1; this.arabicEnd = -1; @@ -271,17 +254,14 @@ class LetterSegmenter implements ISegmenter { } // 判断是否锁定缓冲区 - if (this.arabicStart == -1 && this.arabicEnd == -1) { - // 对缓冲区解锁 - needLock = false; - } else { - needLock = true; - } + // 对缓冲区解锁 + needLock = this.arabicStart != -1 || this.arabicEnd != -1; return needLock; } /** * 判断是否是字母连接符号 + * * @param input * @return */ @@ -292,6 +272,7 @@ class LetterSegmenter implements ISegmenter { /** * 判断是否是数字连接符号 + * * @param input * @return */ diff --git a/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java b/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java index 979daf8..a40ffa3 100755 --- 
a/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java +++ b/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java @@ -28,33 +28,22 @@ import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.List; -/** 词典管理类,单子模式 */ +/** 词典管理类,单例模式 */ public class Dictionary { - /* - * 词典单子实例 - */ + /** 词典单例 */ private static Dictionary singleton; - - /* - * 主词典对象 - */ + /** 主词典对象 */ private DictSegment _MainDict; - - /* - * 停止词词典 - */ + /** 停止词词典 */ private DictSegment _StopWordDict; - /* - * 量词词典 - */ + /** 量词词典 */ private DictSegment _QuantifierDict; - + /** 用户自定义词典路径 */ private static final String PATH_USER_DIC = System.getProperty("user.dir") + "/lucene/userDic/userDic.dic"; - /** 配置对象 */ - private Configuration cfg; + private final Configuration cfg; private Dictionary(Configuration cfg) { this.cfg = cfg; @@ -190,7 +179,7 @@ public class Dictionary { InputStream is = resource.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512); - String theWord = null; + String theWord; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { @@ -210,7 +199,7 @@ public class Dictionary { // 加载扩展词典配置 List extDictFiles = cfg.getExtDictionary(); if (extDictFiles != null) { - InputStream is = null; + InputStream is; for (String extDictName : extDictFiles) { // 读取扩展词典文件 System.out.println("加载扩展词典:" + extDictName); @@ -224,8 +213,9 @@ public class Dictionary { } } try { - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord = null; + BufferedReader br = + new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512); + String theWord; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { @@ -234,17 +224,12 @@ public class Dictionary { _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); - } catch (IOException ioe) { System.err.println("Extension 
Dictionary loading exception."); ioe.printStackTrace(); - } finally { try { - if (is != null) { - is.close(); - is = null; - } + is.close(); } catch (IOException e) { e.printStackTrace(); } @@ -271,26 +256,20 @@ public class Dictionary { } try { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord = null; + String theWord; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { - // System.out.println(theWord); // 加载扩展停止词典数据到内存中 _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); - } catch (IOException ioe) { System.err.println("Extension Stop word Dictionary loading exception."); ioe.printStackTrace(); - } finally { try { - if (is != null) { - is.close(); - is = null; - } + is.close(); } catch (IOException e) { e.printStackTrace(); } @@ -310,25 +289,21 @@ public class Dictionary { throw new RuntimeException("Quantifier Dictionary not found!!!"); } try { - BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); - String theWord = null; + BufferedReader br = + new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512); + String theWord; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); } } while (theWord != null); - } catch (IOException ioe) { System.err.println("Quantifier Dictionary loading exception."); ioe.printStackTrace(); - } finally { try { - if (is != null) { - is.close(); - is = null; - } + is.close(); } catch (IOException e) { e.printStackTrace(); } @@ -349,7 +324,7 @@ public class Dictionary { try { BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512); - String theWord = null; + String theWord; do { theWord = br.readLine(); if (theWord != null && !"".equals(theWord.trim())) { @@ -364,7 +339,6 @@ public class Dictionary { } finally { try { is.close(); 
- is = null; } catch (IOException e) { e.printStackTrace(); } diff --git a/src/main/java/com/rymcu/forest/lucene/lucene/ArticleBeanIndex.java b/src/main/java/com/rymcu/forest/lucene/lucene/ArticleBeanIndex.java index f586e9b..c8d1ea0 100644 --- a/src/main/java/com/rymcu/forest/lucene/lucene/ArticleBeanIndex.java +++ b/src/main/java/com/rymcu/forest/lucene/lucene/ArticleBeanIndex.java @@ -12,7 +12,7 @@ import java.util.List; import java.util.concurrent.CountDownLatch; /** - * BaiKeBeanIndex + * ArticleBeanIndex * * @author suwen * @date 2021/2/2 14:10 diff --git a/src/main/java/com/rymcu/forest/lucene/lucene/BaseIndex.java b/src/main/java/com/rymcu/forest/lucene/lucene/BaseIndex.java index 1998079..b6d703f 100644 --- a/src/main/java/com/rymcu/forest/lucene/lucene/BaseIndex.java +++ b/src/main/java/com/rymcu/forest/lucene/lucene/BaseIndex.java @@ -9,119 +9,120 @@ import java.text.ParseException; import java.util.List; import java.util.concurrent.CountDownLatch; - /** * BaseIndex - * + * * @author suwen * @date 2021/2/2 14:14 */ -public abstract class BaseIndex implements Runnable{ - /** - * 父级索引路径 - */ - private String parentIndexPath; - /** - * 索引编写器 - */ - private IndexWriter writer; - private int subIndex; - /** - * 主线程 - */ - private final CountDownLatch countDownLatch1; - /** - *工作线程 - */ - private final CountDownLatch countDownLatch2; - /** - * 对象列表 - */ - private List list; - public BaseIndex(IndexWriter writer,CountDownLatch countDownLatch1, CountDownLatch countDownLatch2, - List list){ - super(); - this.writer = writer; - this.countDownLatch1 = countDownLatch1; - this.countDownLatch2 = countDownLatch2; - this.list = list; - } - public BaseIndex(String parentIndexPath, int subIndex, - CountDownLatch countDownLatch1, CountDownLatch countDownLatch2, - List list) { - super(); - this.parentIndexPath = parentIndexPath; - this.subIndex = subIndex; - try { - //多目录索引创建 - File file = new File(parentIndexPath+"/index"+subIndex); - if(!file.exists()){ - file.mkdir(); - 
} - this.writer = IndexUtil.getIndexWriter(parentIndexPath+"/index"+subIndex, true); - } catch (IOException e) { - e.printStackTrace(); - }; - this.subIndex = subIndex; - this.countDownLatch1 = countDownLatch1; - this.countDownLatch2 = countDownLatch2; - this.list = list; - } - public BaseIndex(String path,CountDownLatch countDownLatch1, CountDownLatch countDownLatch2, - List list) { - super(); - try { - //单目录索引创建 - File file = new File(path); - if(!file.exists()){ - file.mkdir(); - } - this.writer = IndexUtil.getIndexWriter(path,true); - } catch (IOException e) { - e.printStackTrace(); - }; - this.countDownLatch1 = countDownLatch1; - this.countDownLatch2 = countDownLatch2; - this.list = list; - } +public abstract class BaseIndex implements Runnable { + /** 父级索引路径 */ + private String parentIndexPath; + /** 索引编写器 */ + private IndexWriter writer; - /**创建索引 - * @param writer - * @throws IOException - * @throws ParseException - */ - public abstract void indexDoc(IndexWriter writer,T t) throws Exception; - /**批量索引创建 - * @param writer - * @param t - * @throws Exception - */ - public void indexDocs(IndexWriter writer,List t) throws Exception{ - for (T t2 : t) { - indexDoc(writer,t2); - } + private int subIndex; + /** 主线程 */ + private final CountDownLatch countDownLatch1; + /** 工作线程 */ + private final CountDownLatch countDownLatch2; + /** 对象列表 */ + private List list; + + public BaseIndex( + IndexWriter writer, + CountDownLatch countDownLatch1, + CountDownLatch countDownLatch2, + List list) { + super(); + this.writer = writer; + this.countDownLatch1 = countDownLatch1; + this.countDownLatch2 = countDownLatch2; + this.list = list; + } + + public BaseIndex( + String parentIndexPath, + int subIndex, + CountDownLatch countDownLatch1, + CountDownLatch countDownLatch2, + List list) { + super(); + this.parentIndexPath = parentIndexPath; + this.subIndex = subIndex; + try { + // 多目录索引创建 + File file = new File(parentIndexPath + "/index" + subIndex); + if (!file.exists()) { + 
file.mkdir(); + } + this.writer = IndexUtil.getIndexWriter(parentIndexPath + "/index" + subIndex, true); + } catch (IOException e) { + e.printStackTrace(); } + ; + this.subIndex = subIndex; + this.countDownLatch1 = countDownLatch1; + this.countDownLatch2 = countDownLatch2; + this.list = list; + } - @Override - public void run() { - try { - countDownLatch1.await(); - System.out.println(writer); - indexDocs(writer,list); - } catch (InterruptedException e) { - e.printStackTrace(); - } catch (Exception e) { - e.printStackTrace(); - }finally{ - countDownLatch2.countDown(); - try { - writer.commit(); - writer.close(); - } catch (IOException e) { - e.printStackTrace(); - } - - } - + public BaseIndex( + String path, CountDownLatch countDownLatch1, CountDownLatch countDownLatch2, List list) { + super(); + try { + // 单目录索引创建 + File file = new File(path); + if (!file.exists()) { + file.mkdir(); + } + this.writer = IndexUtil.getIndexWriter(path, true); + } catch (IOException e) { + e.printStackTrace(); } + ; + this.countDownLatch1 = countDownLatch1; + this.countDownLatch2 = countDownLatch2; + this.list = list; + } + + /** + * 创建索引 + * + * @param writer + * @throws IOException + * @throws ParseException + */ + public abstract void indexDoc(IndexWriter writer, T t) throws Exception; + /** + * 批量索引创建 + * + * @param writer + * @param t + * @throws Exception + */ + public void indexDocs(IndexWriter writer, List t) throws Exception { + for (T t2 : t) { + indexDoc(writer, t2); + } + } + + @Override + public void run() { + try { + countDownLatch1.await(); + System.out.println(writer); + indexDocs(writer, list); + } catch (Exception e) { + e.printStackTrace(); + } finally { + countDownLatch2.countDown(); + try { + writer.commit(); + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } } diff --git a/src/main/java/com/rymcu/forest/lucene/service/impl/UserDicServiceImpl.java b/src/main/java/com/rymcu/forest/lucene/service/impl/UserDicServiceImpl.java index 
b4a344e..5587560 100644 --- a/src/main/java/com/rymcu/forest/lucene/service/impl/UserDicServiceImpl.java +++ b/src/main/java/com/rymcu/forest/lucene/service/impl/UserDicServiceImpl.java @@ -22,9 +22,6 @@ public class UserDicServiceImpl implements UserDicService { @Resource private UserDicMapper userDicMapper; - /** Lucene索引文件路径 */ - private final String dicPath = System.getProperty("user.dir") + "/lucene/userDic/userDic.dic"; - @Override public List getAllDic() { @@ -56,8 +53,12 @@ public class UserDicServiceImpl implements UserDicService { private void writeUserDic() { try { - File file = new File(dicPath); - FileOutputStream stream = new FileOutputStream(file, false); + String filePath = "lucene/userDic/"; + File file = new File(filePath); + if (!file.exists()) { + file.mkdirs(); + } + FileOutputStream stream = new FileOutputStream(file + "/userDic.dic", false); OutputStreamWriter outfw = new OutputStreamWriter(stream, StandardCharsets.UTF_8); PrintWriter fw = new PrintWriter(new BufferedWriter(outfw)); userDicMapper @@ -70,7 +71,7 @@ public class UserDicServiceImpl implements UserDicService { fw.flush(); fw.close(); Dictionary.getSingleton().updateUserDict(); - } catch (FileNotFoundException e) { + } catch (IOException e) { e.printStackTrace(); } } diff --git a/src/main/java/com/rymcu/forest/lucene/util/IndexUtil.java b/src/main/java/com/rymcu/forest/lucene/util/IndexUtil.java index 3f46fa4..be66434 100644 --- a/src/main/java/com/rymcu/forest/lucene/util/IndexUtil.java +++ b/src/main/java/com/rymcu/forest/lucene/util/IndexUtil.java @@ -28,7 +28,7 @@ public class IndexUtil { * @throws IOException */ public static IndexWriter getIndexWriter(String indexPath,boolean create) throws IOException{ - Directory dir = FSDirectory.open(Paths.get(indexPath, new String[0])); + Directory dir = FSDirectory.open(Paths.get(indexPath)); Analyzer analyzer = new IKAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); LogMergePolicy mergePolicy = new 
LogByteSizeMergePolicy(); diff --git a/src/main/java/com/rymcu/forest/lucene/util/SearchUtil.java b/src/main/java/com/rymcu/forest/lucene/util/SearchUtil.java index b0d46f0..3f84d08 100644 --- a/src/main/java/com/rymcu/forest/lucene/util/SearchUtil.java +++ b/src/main/java/com/rymcu/forest/lucene/util/SearchUtil.java @@ -39,7 +39,7 @@ public class SearchUtil { IndexReader[] readers = new IndexReader[files.length]; for (int i = 0; i < files.length; i++) { readers[i] = - DirectoryReader.open(FSDirectory.open(Paths.get(files[i].getPath(), new String[0]))); + DirectoryReader.open(FSDirectory.open(Paths.get(files[i].getPath()))); } reader = new MultiReader(readers); } catch (IOException e) { @@ -55,7 +55,7 @@ public class SearchUtil { * @throws IOException */ public static DirectoryReader getIndexReader(String indexPath) throws IOException { - return DirectoryReader.open(FSDirectory.open(Paths.get(indexPath, new String[0]))); + return DirectoryReader.open(FSDirectory.open(Paths.get(indexPath))); } /** * 根据索引路径获取IndexSearcher