map = new HashMap<>(2);
+ map.put("userDic", pageInfo.getList());
+ Map pagination = Utils.getPagination(pageInfo);
+ map.put("pagination", pagination);
+ return GlobalResultGenerator.genSuccessResult(map);
+ }
- @PostMapping("/addDic/{dic}")
- public GlobalResult addDic(@PathVariable String dic) {
- dicService.addDic(dic);
- return GlobalResultGenerator.genSuccessResult("新增字典成功");
- }
+ @PostMapping("/addDic/{dic}")
+ public GlobalResult addDic(@PathVariable String dic) {
+ dicService.addDic(dic);
+ return GlobalResultGenerator.genSuccessResult("新增字典成功");
+ }
- @PutMapping("/editDic")
- public GlobalResult getAllDic(@RequestBody UserDic dic) {
- dicService.updateDic(dic);
- return GlobalResultGenerator.genSuccessResult("更新字典成功");
- }
+ @PutMapping("/editDic")
+ public GlobalResult getAllDic(@RequestBody UserDic dic) {
+ dicService.updateDic(dic);
+ return GlobalResultGenerator.genSuccessResult("更新字典成功");
+ }
- @DeleteMapping("/deleteDic/{id}")
- public GlobalResult deleteDic(@PathVariable String id) {
- dicService.deleteDic(id);
- return GlobalResultGenerator.genSuccessResult("删除字典成功");
- }
+ @DeleteMapping("/deleteDic/{id}")
+ public GlobalResult deleteDic(@PathVariable String id) {
+ dicService.deleteDic(id);
+ return GlobalResultGenerator.genSuccessResult("删除字典成功");
+ }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/cfg/Configuration.java b/src/main/java/com/rymcu/forest/lucene/cfg/Configuration.java
index 21d7236..b7022a3 100644
--- a/src/main/java/com/rymcu/forest/lucene/cfg/Configuration.java
+++ b/src/main/java/com/rymcu/forest/lucene/cfg/Configuration.java
@@ -1,75 +1,74 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
- *
+ *
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- *
+ *
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
- *
*/
package com.rymcu.forest.lucene.cfg;
import java.util.List;
/**
- *
+ *
* 配置管理类接口
- *
+ *
*/
public interface Configuration {
- /**
- * 返回useSmart标志位
- * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
- * @return useSmart
- */
- boolean useSmart();
+ /**
+ * 返回useSmart标志位
+ * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
+ * @return useSmart
+ */
+ boolean useSmart();
- /**
- * 设置useSmart标志位
- * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
- * @param useSmart
- */
- void setUseSmart(boolean useSmart);
+ /**
+ * 设置useSmart标志位
+ * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
+ * @param useSmart
+ */
+ void setUseSmart(boolean useSmart);
- /**
- * 获取主词典路径
- *
- * @return String 主词典路径
- */
- String getMainDictionary();
+ /**
+ * 获取主词典路径
+ *
+ * @return String 主词典路径
+ */
+ String getMainDictionary();
- /**
- * 获取量词词典路径
- * @return String 量词词典路径
- */
- String getQuantifierDictionary();
+ /**
+ * 获取量词词典路径
+ * @return String 量词词典路径
+ */
+ String getQuantifierDictionary();
- /**
- * 获取扩展字典配置路径
- * @return List 相对类加载器的路径
- */
- List getExtDictionary();
+ /**
+ * 获取扩展字典配置路径
+ * @return List 相对类加载器的路径
+ */
+ List getExtDictionary();
- /**
- * 获取扩展停止词典配置路径
- * @return List 相对类加载器的路径
- */
- List getExtStopWordDictionary();
+ /**
+ * 获取扩展停止词典配置路径
+ * @return List 相对类加载器的路径
+ */
+ List getExtStopWordDictionary();
}
diff --git a/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java b/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java
index d8dd9be..92c2312 100644
--- a/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java
+++ b/src/main/java/com/rymcu/forest/lucene/cfg/DefaultConfig.java
@@ -24,110 +24,124 @@ import org.springframework.stereotype.Component;
import java.util.ArrayList;
import java.util.List;
-/** Configuration 默认实现 2012-5-8 */
+/**
+ * Configuration 默认实现 2012-5-8
+ */
@Component
public class DefaultConfig implements Configuration {
- /** 分词器默认字典路径 */
- private static final String PATH_DIC_MAIN = "lucene/main2012.dic";
- /** 题词字典路径 */
- private static final String PATH_DIC_QUANTIFIER = "lucene/quantifier.dic";
- /** 用户自定义字典路径 */
- private static final String PATH_USER_DIC =
- System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";
- /** 配置属性——扩展字典 */
- private String extDic = "lucene/ext.dic;" + PATH_USER_DIC;
- /** 配置属性——扩展停止词典 */
- private String extStopword = "lucene/stopword.dic";
- /** 是否使用smart方式分词 */
- private boolean useSmart;
+ /**
+ * 分词器默认字典路径
+ */
+ private static final String PATH_DIC_MAIN = "lucene/main2012.dic";
+ /**
+ * 题词字典路径
+ */
+ private static final String PATH_DIC_QUANTIFIER = "lucene/quantifier.dic";
+ /**
+ * 用户自定义字典路径
+ */
+ private static final String PATH_USER_DIC =
+ System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";
+ /**
+ * 配置属性——扩展字典
+ */
+ private String extDic = "lucene/ext.dic;" + PATH_USER_DIC;
+ /**
+ * 配置属性——扩展停止词典
+ */
+ private String extStopword = "lucene/stopword.dic";
+ /**
+ * 是否使用smart方式分词
+ */
+ private boolean useSmart;
- /**
- * 返回单例
- *
- * @return Configuration单例
- */
- public static Configuration getInstance() {
- return new DefaultConfig();
- }
-
- /**
- * 返回useSmart标志位 useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
- *
- * @return useSmart
- */
- @Override
- public boolean useSmart() {
- return useSmart;
- }
-
- /**
- * 设置useSmart标志位 useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
- *
- * @param useSmart
- */
- @Override
- public void setUseSmart(boolean useSmart) {
- this.useSmart = useSmart;
- }
-
- /**
- * 获取主词典路径
- *
- * @return String 主词典路径
- */
- @Override
- public String getMainDictionary() {
- return PATH_DIC_MAIN;
- }
-
- /**
- * 获取量词词典路径
- *
- * @return String 量词词典路径
- */
- @Override
- public String getQuantifierDictionary() {
- return PATH_DIC_QUANTIFIER;
- }
-
- /**
- * 获取扩展字典配置路径
- *
- * @return List 相对类加载器的路径
- */
- @Override
- public List getExtDictionary() {
- List extDictFiles = new ArrayList(2);
- if (extDic != null) {
- // 使用;分割多个扩展字典配置
- String[] filePaths = extDic.split(";");
- for (String filePath : filePaths) {
- if (filePath != null && !"".equals(filePath.trim())) {
- extDictFiles.add(filePath.trim());
- }
- }
+ /**
+ * 返回单例
+ *
+ * @return Configuration单例
+ */
+ public static Configuration getInstance() {
+ return new DefaultConfig();
}
- return extDictFiles;
- }
- /**
- * 获取扩展停止词典配置路径
- *
- * @return List 相对类加载器的路径
- */
- @Override
- public List getExtStopWordDictionary() {
- List extStopWordDictFiles = new ArrayList<>(2);
- if (extStopword != null) {
- // 使用;分割多个扩展字典配置
- String[] filePaths = extStopword.split(";");
- for (String filePath : filePaths) {
- if (filePath != null && !"".equals(filePath.trim())) {
- extStopWordDictFiles.add(filePath.trim());
- }
- }
+ /**
+ * 返回useSmart标志位 useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
+ *
+ * @return useSmart
+ */
+ @Override
+ public boolean useSmart() {
+ return useSmart;
+ }
+
+ /**
+ * 设置useSmart标志位 useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分
+ *
+ * @param useSmart
+ */
+ @Override
+ public void setUseSmart(boolean useSmart) {
+ this.useSmart = useSmart;
+ }
+
+ /**
+ * 获取主词典路径
+ *
+ * @return String 主词典路径
+ */
+ @Override
+ public String getMainDictionary() {
+ return PATH_DIC_MAIN;
+ }
+
+ /**
+ * 获取量词词典路径
+ *
+ * @return String 量词词典路径
+ */
+ @Override
+ public String getQuantifierDictionary() {
+ return PATH_DIC_QUANTIFIER;
+ }
+
+ /**
+ * 获取扩展字典配置路径
+ *
+ * @return List 相对类加载器的路径
+ */
+ @Override
+ public List getExtDictionary() {
+ List extDictFiles = new ArrayList(2);
+ if (extDic != null) {
+ // 使用;分割多个扩展字典配置
+ String[] filePaths = extDic.split(";");
+ for (String filePath : filePaths) {
+ if (filePath != null && !"".equals(filePath.trim())) {
+ extDictFiles.add(filePath.trim());
+ }
+ }
+ }
+ return extDictFiles;
+ }
+
+ /**
+ * 获取扩展停止词典配置路径
+ *
+ * @return List 相对类加载器的路径
+ */
+ @Override
+ public List getExtStopWordDictionary() {
+ List extStopWordDictFiles = new ArrayList<>(2);
+ if (extStopword != null) {
+ // 使用;分割多个扩展字典配置
+ String[] filePaths = extStopword.split(";");
+ for (String filePath : filePaths) {
+ if (filePath != null && !"".equals(filePath.trim())) {
+ extStopWordDictFiles.add(filePath.trim());
+ }
+ }
+ }
+ return extStopWordDictFiles;
}
- return extStopWordDictFiles;
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java b/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java
index 0450695..5ff4fcf 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/AnalyzeContext.java
@@ -26,328 +26,364 @@ import java.io.IOException;
import java.io.Reader;
import java.util.*;
-/** 分词器上下文状态 */
+/**
+ * 分词器上下文状态
+ */
class AnalyzeContext {
- /** 默认缓冲区大小 */
- private static final int BUFF_SIZE = 4096;
- /** 缓冲区耗尽的临界值 */
- private static final int BUFF_EXHAUST_CRITICAL = 100;
- /** 字符窜读取缓冲 */
- private char[] segmentBuff;
- /** 字符类型数组 */
- private int[] charTypes;
- /** 记录Reader内已分析的字串总长度, 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 */
- private int buffOffset;
- /** 当前缓冲区位置指针 */
- private int cursor;
- /** 最近一次读入的,可处理的字串长度 */
- private int available;
- /** 子分词器锁, 该集合非空,说明有子分词器在占用segmentBuff */
- private final Set buffLocker;
- /** 原始分词结果集合,未经歧义处理 */
- private QuickSortSet orgLexemes;
- /** LexemePath位置索引表 */
- private final Map pathMap;
- /** 最终分词结果集 */
- private final LinkedList results;
- /** 分词器配置项 */
- private final Configuration cfg;
+ /**
+ * 默认缓冲区大小
+ */
+ private static final int BUFF_SIZE = 4096;
+ /**
+ * 缓冲区耗尽的临界值
+ */
+ private static final int BUFF_EXHAUST_CRITICAL = 100;
+ /**
+ * 字符窜读取缓冲
+ */
+ private char[] segmentBuff;
+ /**
+ * 字符类型数组
+ */
+ private int[] charTypes;
+ /**
+ * 记录Reader内已分析的字串总长度, 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
+ */
+ private int buffOffset;
+ /**
+ * 当前缓冲区位置指针
+ */
+ private int cursor;
+ /**
+ * 最近一次读入的,可处理的字串长度
+ */
+ private int available;
+ /**
+ * 子分词器锁, 该集合非空,说明有子分词器在占用segmentBuff
+ */
+ private final Set buffLocker;
+ /**
+ * 原始分词结果集合,未经歧义处理
+ */
+ private QuickSortSet orgLexemes;
+ /**
+ * LexemePath位置索引表
+ */
+ private final Map pathMap;
+ /**
+ * 最终分词结果集
+ */
+ private final LinkedList results;
+ /**
+ * 分词器配置项
+ */
+ private final Configuration cfg;
- public AnalyzeContext(Configuration cfg) {
- this.cfg = cfg;
- this.segmentBuff = new char[BUFF_SIZE];
- this.charTypes = new int[BUFF_SIZE];
- this.buffLocker = new HashSet<>();
- this.orgLexemes = new QuickSortSet();
- this.pathMap = new HashMap<>();
- this.results = new LinkedList<>();
- }
-
- int getCursor() {
- return this.cursor;
- }
-
- char[] getSegmentBuff() {
- return this.segmentBuff;
- }
-
- char getCurrentChar() {
- return this.segmentBuff[this.cursor];
- }
-
- int getCurrentCharType() {
- return this.charTypes[this.cursor];
- }
-
- int getBufferOffset() {
- return this.buffOffset;
- }
-
- /**
- * 根据context的上下文情况,填充segmentBuff
- *
- * @param reader
- * @return 返回待分析的(有效的)字串长度
- * @throws IOException
- */
- int fillBuffer(Reader reader) throws IOException {
- int readCount = 0;
- if (this.buffOffset == 0) {
- // 首次读取reader
- readCount = reader.read(segmentBuff);
- } else {
- int offset = this.available - this.cursor;
- if (offset > 0) {
- // 最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部
- System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
- readCount = offset;
- }
- // 继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分
- readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
+ public AnalyzeContext(Configuration cfg) {
+ this.cfg = cfg;
+ this.segmentBuff = new char[BUFF_SIZE];
+ this.charTypes = new int[BUFF_SIZE];
+ this.buffLocker = new HashSet<>();
+ this.orgLexemes = new QuickSortSet();
+ this.pathMap = new HashMap<>();
+ this.results = new LinkedList<>();
}
- // 记录最后一次从Reader中读入的可用字符长度
- this.available = readCount;
- // 重置当前指针
- this.cursor = 0;
- return readCount;
- }
- /** 初始化buff指针,处理第一个字符 */
- void initCursor() {
- this.cursor = 0;
- this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
- this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
- }
-
- /** 指针+1 成功返回 true; 指针已经到了buff尾部,不能前进,返回false 并处理当前字符 */
- boolean moveCursor() {
- if (this.cursor < this.available - 1) {
- this.cursor++;
- this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
- this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
- return true;
- } else {
- return false;
+ int getCursor() {
+ return this.cursor;
}
- }
- /**
- * 设置当前segmentBuff为锁定状态 加入占用segmentBuff的子分词器名称,表示占用segmentBuff
- *
- * @param segmenterName
- */
- void lockBuffer(String segmenterName) {
- this.buffLocker.add(segmenterName);
- }
-
- /**
- * 移除指定的子分词器名,释放对segmentBuff的占用
- *
- * @param segmenterName
- */
- void unlockBuffer(String segmenterName) {
- this.buffLocker.remove(segmenterName);
- }
-
- /**
- * 只要buffLocker中存在segmenterName 则buffer被锁定
- *
- * @return boolean 缓冲去是否被锁定
- */
- boolean isBufferLocked() {
- return this.buffLocker.size() > 0;
- }
-
- /**
- * 判断当前segmentBuff是否已经用完 当前执针cursor移至segmentBuff末端this.available - 1
- *
- * @return
- */
- boolean isBufferConsumed() {
- return this.cursor == this.available - 1;
- }
-
- /**
- * 判断segmentBuff是否需要读取新数据
- *
- * 满足一下条件时, 1.available == BUFF_SIZE 表示buffer满载 2.buffIndex < available - 1 && buffIndex >
- * available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 3.!context.isBufferLocked()表示没有segmenter在占用buffer
- * 要中断当前循环(buffer要进行移位,并再读取数据的操作)
- *
- * @return
- */
- boolean needRefillBuffer() {
- return this.available == BUFF_SIZE
- && this.cursor < this.available - 1
- && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
- && !this.isBufferLocked();
- }
-
- /** 累计当前的segmentBuff相对于reader起始位置的位移 */
- void markBufferOffset() {
- this.buffOffset += this.cursor;
- }
-
- /**
- * 向分词结果集添加词元
- *
- * @param lexeme
- */
- void addLexeme(Lexeme lexeme) {
- this.orgLexemes.addLexeme(lexeme);
- }
-
- /**
- * 添加分词结果路径 路径起始位置 ---> 路径 映射表
- *
- * @param path
- */
- void addLexemePath(LexemePath path) {
- if (path != null) {
- this.pathMap.put(path.getPathBegin(), path);
+ char[] getSegmentBuff() {
+ return this.segmentBuff;
}
- }
- /**
- * 返回原始分词结果
- *
- * @return
- */
- QuickSortSet getOrgLexemes() {
- return this.orgLexemes;
- }
+ char getCurrentChar() {
+ return this.segmentBuff[this.cursor];
+ }
- /**
- * 推送分词结果到结果集合 1.从buff头部遍历到this.cursor已处理位置 2.将map中存在的分词结果推入results
- * 3.将map中不存在的CJDK字符以单字方式推入results
- */
- void outputToResult() {
- int index = 0;
- while (index <= this.cursor) {
- // 跳过非CJK字符
- if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
- index++;
- continue;
- }
- // 从pathMap找出对应index位置的LexemePath
- LexemePath path = this.pathMap.get(index);
- if (path != null) {
- // 输出LexemePath中的lexeme到results集合
- Lexeme l = path.pollFirst();
- while (l != null) {
- this.results.add(l);
- // 将index移至lexeme后
- index = l.getBegin() + l.getLength();
- l = path.pollFirst();
- if (l != null) {
- // 输出path内部,词元间遗漏的单字
- for (; index < l.getBegin(); index++) {
- this.outputSingleCJK(index);
+ int getCurrentCharType() {
+ return this.charTypes[this.cursor];
+ }
+
+ int getBufferOffset() {
+ return this.buffOffset;
+ }
+
+ /**
+ * 根据context的上下文情况,填充segmentBuff
+ *
+ * @param reader
+ * @return 返回待分析的(有效的)字串长度
+ * @throws IOException
+ */
+ int fillBuffer(Reader reader) throws IOException {
+ int readCount = 0;
+ if (this.buffOffset == 0) {
+ // 首次读取reader
+ readCount = reader.read(segmentBuff);
+ } else {
+ int offset = this.available - this.cursor;
+ if (offset > 0) {
+ // 最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部
+ System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
+ readCount = offset;
}
- }
+ // 继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分
+ readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
}
- } else { // pathMap中找不到index对应的LexemePath
- // 单字输出
- this.outputSingleCJK(index);
- index++;
- }
+ // 记录最后一次从Reader中读入的可用字符长度
+ this.available = readCount;
+ // 重置当前指针
+ this.cursor = 0;
+ return readCount;
}
- // 清空当前的Map
- this.pathMap.clear();
- }
- /**
- * 对CJK字符进行单字输出
- *
- * @param index
- */
- private void outputSingleCJK(int index) {
- if (CharacterUtil.CHAR_CHINESE == this.charTypes[index]) {
- Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_CNCHAR);
- this.results.add(singleCharLexeme);
- } else if (CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]) {
- Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_OTHER_CJK);
- this.results.add(singleCharLexeme);
+ /**
+ * 初始化buff指针,处理第一个字符
+ */
+ void initCursor() {
+ this.cursor = 0;
+ this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+ this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
}
- }
- /**
- * 返回lexeme
- *
- * 同时处理合并
- *
- * @return
- */
- Lexeme getNextLexeme() {
- // 从结果集取出,并移除第一个Lexme
- Lexeme result = this.results.pollFirst();
- while (result != null) {
- // 数量词合并
- this.compound(result);
- if (Dictionary.getSingleton()
- .isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
- // 是停止词继续取列表的下一个
- result = this.results.pollFirst();
- } else {
- // 不是停止词, 生成lexeme的词元文本,输出
- result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
- break;
- }
- }
- return result;
- }
-
- /** 重置分词上下文状态 */
- void reset() {
- this.buffLocker.clear();
- this.orgLexemes = new QuickSortSet();
- this.available = 0;
- this.buffOffset = 0;
- this.charTypes = new int[BUFF_SIZE];
- this.cursor = 0;
- this.results.clear();
- this.segmentBuff = new char[BUFF_SIZE];
- this.pathMap.clear();
- }
-
- /** 组合词元 */
- private void compound(Lexeme result) {
- if (!this.cfg.useSmart()) {
- return;
- }
- // 数量词合并处理
- if (!this.results.isEmpty()) {
-
- if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
- Lexeme nextLexeme = this.results.peekFirst();
- boolean appendOk = false;
- if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
- // 合并英文数词+中文数词
- appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
- } else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
- // 合并英文数词+中文量词
- appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
+ /**
+ * 指针+1 成功返回 true; 指针已经到了buff尾部,不能前进,返回false 并处理当前字符
+ */
+ boolean moveCursor() {
+ if (this.cursor < this.available - 1) {
+ this.cursor++;
+ this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
+ this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * 设置当前segmentBuff为锁定状态 加入占用segmentBuff的子分词器名称,表示占用segmentBuff
+ *
+ * @param segmenterName
+ */
+ void lockBuffer(String segmenterName) {
+ this.buffLocker.add(segmenterName);
+ }
+
+ /**
+ * 移除指定的子分词器名,释放对segmentBuff的占用
+ *
+ * @param segmenterName
+ */
+ void unlockBuffer(String segmenterName) {
+ this.buffLocker.remove(segmenterName);
+ }
+
+ /**
+ * 只要buffLocker中存在segmenterName 则buffer被锁定
+ *
+ * @return boolean 缓冲去是否被锁定
+ */
+ boolean isBufferLocked() {
+ return this.buffLocker.size() > 0;
+ }
+
+ /**
+ * 判断当前segmentBuff是否已经用完 当前执针cursor移至segmentBuff末端this.available - 1
+ *
+ * @return
+ */
+ boolean isBufferConsumed() {
+ return this.cursor == this.available - 1;
+ }
+
+ /**
+ * 判断segmentBuff是否需要读取新数据
+ *
+ * 满足一下条件时, 1.available == BUFF_SIZE 表示buffer满载 2.buffIndex < available - 1 && buffIndex >
+ * available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 3.!context.isBufferLocked()表示没有segmenter在占用buffer
+ * 要中断当前循环(buffer要进行移位,并再读取数据的操作)
+ *
+ * @return
+ */
+ boolean needRefillBuffer() {
+ return this.available == BUFF_SIZE
+ && this.cursor < this.available - 1
+ && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
+ && !this.isBufferLocked();
+ }
+
+ /**
+ * 累计当前的segmentBuff相对于reader起始位置的位移
+ */
+ void markBufferOffset() {
+ this.buffOffset += this.cursor;
+ }
+
+ /**
+ * 向分词结果集添加词元
+ *
+ * @param lexeme
+ */
+ void addLexeme(Lexeme lexeme) {
+ this.orgLexemes.addLexeme(lexeme);
+ }
+
+ /**
+ * 添加分词结果路径 路径起始位置 ---> 路径 映射表
+ *
+ * @param path
+ */
+ void addLexemePath(LexemePath path) {
+ if (path != null) {
+ this.pathMap.put(path.getPathBegin(), path);
+ }
+ }
+
+ /**
+ * 返回原始分词结果
+ *
+ * @return
+ */
+ QuickSortSet getOrgLexemes() {
+ return this.orgLexemes;
+ }
+
+ /**
+ * 推送分词结果到结果集合 1.从buff头部遍历到this.cursor已处理位置 2.将map中存在的分词结果推入results
+ * 3.将map中不存在的CJDK字符以单字方式推入results
+ */
+ void outputToResult() {
+ int index = 0;
+ while (index <= this.cursor) {
+ // 跳过非CJK字符
+ if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
+ index++;
+ continue;
+ }
+ // 从pathMap找出对应index位置的LexemePath
+ LexemePath path = this.pathMap.get(index);
+ if (path != null) {
+ // 输出LexemePath中的lexeme到results集合
+ Lexeme l = path.pollFirst();
+ while (l != null) {
+ this.results.add(l);
+ // 将index移至lexeme后
+ index = l.getBegin() + l.getLength();
+ l = path.pollFirst();
+ if (l != null) {
+ // 输出path内部,词元间遗漏的单字
+ for (; index < l.getBegin(); index++) {
+ this.outputSingleCJK(index);
+ }
+ }
+ }
+ } else { // pathMap中找不到index对应的LexemePath
+ // 单字输出
+ this.outputSingleCJK(index);
+ index++;
+ }
+ }
+ // 清空当前的Map
+ this.pathMap.clear();
+ }
+
+ /**
+ * 对CJK字符进行单字输出
+ *
+ * @param index
+ */
+ private void outputSingleCJK(int index) {
+ if (CharacterUtil.CHAR_CHINESE == this.charTypes[index]) {
+ Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_CNCHAR);
+ this.results.add(singleCharLexeme);
+ } else if (CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]) {
+ Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_OTHER_CJK);
+ this.results.add(singleCharLexeme);
+ }
+ }
+
+ /**
+ * 返回lexeme
+ *
+ * 同时处理合并
+ *
+ * @return
+ */
+ Lexeme getNextLexeme() {
+ // 从结果集取出,并移除第一个Lexme
+ Lexeme result = this.results.pollFirst();
+ while (result != null) {
+ // 数量词合并
+ this.compound(result);
+ if (Dictionary.getSingleton()
+ .isStopWord(this.segmentBuff, result.getBegin(), result.getLength())) {
+ // 是停止词继续取列表的下一个
+ result = this.results.pollFirst();
+ } else {
+ // 不是停止词, 生成lexeme的词元文本,输出
+ result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
+ break;
+ }
+ }
+ return result;
+ }
+
+ /**
+ * 重置分词上下文状态
+ */
+ void reset() {
+ this.buffLocker.clear();
+ this.orgLexemes = new QuickSortSet();
+ this.available = 0;
+ this.buffOffset = 0;
+ this.charTypes = new int[BUFF_SIZE];
+ this.cursor = 0;
+ this.results.clear();
+ this.segmentBuff = new char[BUFF_SIZE];
+ this.pathMap.clear();
+ }
+
+ /**
+ * 组合词元
+ */
+ private void compound(Lexeme result) {
+ if (!this.cfg.useSmart()) {
+ return;
+ }
+ // 数量词合并处理
+ if (!this.results.isEmpty()) {
+
+ if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
+ Lexeme nextLexeme = this.results.peekFirst();
+ boolean appendOk = false;
+ if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
+ // 合并英文数词+中文数词
+ appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
+ } else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
+ // 合并英文数词+中文量词
+ appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
+ }
+ if (appendOk) {
+ // 弹出
+ this.results.pollFirst();
+ }
+ }
+
+ // 可能存在第二轮合并
+ if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
+ Lexeme nextLexeme = this.results.peekFirst();
+ boolean appendOk = false;
+ if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
+ // 合并中文数词+中文量词
+ appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
+ }
+ if (appendOk) {
+ // 弹出
+ this.results.pollFirst();
+ }
+ }
}
- if (appendOk) {
- // 弹出
- this.results.pollFirst();
- }
- }
-
- // 可能存在第二轮合并
- if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
- Lexeme nextLexeme = this.results.peekFirst();
- boolean appendOk = false;
- if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
- // 合并中文数词+中文量词
- appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
- }
- if (appendOk) {
- // 弹出
- this.results.pollFirst();
- }
- }
}
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java
index 948beca..8fd8c64 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/CJKSegmenter.java
@@ -25,95 +25,101 @@ import com.rymcu.forest.lucene.dic.Hit;
import java.util.LinkedList;
import java.util.List;
-/** 中文-日韩文子分词器 */
+/**
+ * 中文-日韩文子分词器
+ */
class CJKSegmenter implements ISegmenter {
- /** 子分词器标签 */
- static final String SEGMENTER_NAME = "CJK_SEGMENTER";
- /** 待处理的分词hit队列 */
- private List tmpHits;
+ /**
+ * 子分词器标签
+ */
+ static final String SEGMENTER_NAME = "CJK_SEGMENTER";
+ /**
+ * 待处理的分词hit队列
+ */
+ private List tmpHits;
- CJKSegmenter() {
- this.tmpHits = new LinkedList();
- }
+ CJKSegmenter() {
+ this.tmpHits = new LinkedList();
+ }
- @Override
- public void analyze(AnalyzeContext context) {
- if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) {
+ @Override
+ public void analyze(AnalyzeContext context) {
+ if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) {
- // 优先处理tmpHits中的hit
- if (!this.tmpHits.isEmpty()) {
- // 处理词段队列
- Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
- for (Hit hit : tmpArray) {
- hit =
- Dictionary.getSingleton()
- .matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
- if (hit.isMatch()) {
- // 输出当前的词
- Lexeme newLexeme =
- new Lexeme(
- context.getBufferOffset(),
- hit.getBegin(),
- context.getCursor() - hit.getBegin() + 1,
- Lexeme.TYPE_CNWORD);
- context.addLexeme(newLexeme);
+ // 优先处理tmpHits中的hit
+ if (!this.tmpHits.isEmpty()) {
+ // 处理词段队列
+ Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
+ for (Hit hit : tmpArray) {
+ hit =
+ Dictionary.getSingleton()
+ .matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
+ if (hit.isMatch()) {
+ // 输出当前的词
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ hit.getBegin(),
+ context.getCursor() - hit.getBegin() + 1,
+ Lexeme.TYPE_CNWORD);
+ context.addLexeme(newLexeme);
- if (!hit.isPrefix()) { // 不是词前缀,hit不需要继续匹配,移除
- this.tmpHits.remove(hit);
+ if (!hit.isPrefix()) { // 不是词前缀,hit不需要继续匹配,移除
+ this.tmpHits.remove(hit);
+ }
+
+ } else if (hit.isUnmatch()) {
+ // hit不是词,移除
+ this.tmpHits.remove(hit);
+ }
+ }
}
- } else if (hit.isUnmatch()) {
- // hit不是词,移除
- this.tmpHits.remove(hit);
- }
+ // 再对当前指针位置的字符进行单字匹配
+ Hit singleCharHit =
+ Dictionary.getSingleton()
+ .matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
+ if (singleCharHit.isMatch()) { // 首字成词
+ // 输出当前的词
+ Lexeme newLexeme =
+ new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD);
+ context.addLexeme(newLexeme);
+
+ // 同时也是词前缀
+ if (singleCharHit.isPrefix()) {
+ // 前缀匹配则放入hit列表
+ this.tmpHits.add(singleCharHit);
+ }
+ } else if (singleCharHit.isPrefix()) { // 首字为词前缀
+ // 前缀匹配则放入hit列表
+ this.tmpHits.add(singleCharHit);
+ }
+
+ } else {
+ // 遇到CHAR_USELESS字符
+ // 清空队列
+ this.tmpHits.clear();
}
- }
- // 再对当前指针位置的字符进行单字匹配
- Hit singleCharHit =
- Dictionary.getSingleton()
- .matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
- if (singleCharHit.isMatch()) { // 首字成词
- // 输出当前的词
- Lexeme newLexeme =
- new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_CNWORD);
- context.addLexeme(newLexeme);
-
- // 同时也是词前缀
- if (singleCharHit.isPrefix()) {
- // 前缀匹配则放入hit列表
- this.tmpHits.add(singleCharHit);
+ // 判断缓冲区是否已经读完
+ if (context.isBufferConsumed()) {
+ // 清空队列
+ this.tmpHits.clear();
}
- } else if (singleCharHit.isPrefix()) { // 首字为词前缀
- // 前缀匹配则放入hit列表
- this.tmpHits.add(singleCharHit);
- }
- } else {
- // 遇到CHAR_USELESS字符
- // 清空队列
- this.tmpHits.clear();
+ // 判断是否锁定缓冲区
+ if (this.tmpHits.size() == 0) {
+ context.unlockBuffer(SEGMENTER_NAME);
+
+ } else {
+ context.lockBuffer(SEGMENTER_NAME);
+ }
}
- // 判断缓冲区是否已经读完
- if (context.isBufferConsumed()) {
- // 清空队列
- this.tmpHits.clear();
+ @Override
+ public void reset() {
+ // 清空队列
+ this.tmpHits.clear();
}
-
- // 判断是否锁定缓冲区
- if (this.tmpHits.size() == 0) {
- context.unlockBuffer(SEGMENTER_NAME);
-
- } else {
- context.lockBuffer(SEGMENTER_NAME);
- }
- }
-
- @Override
- public void reset() {
- // 清空队列
- this.tmpHits.clear();
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/CN_QuantifierSegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/CN_QuantifierSegmenter.java
index cc0227d..30a2314 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/CN_QuantifierSegmenter.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/CN_QuantifierSegmenter.java
@@ -27,200 +27,218 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Set;
-/** 中文数量词子分词器 */
+/**
+ * 中文数量词子分词器
+ */
class CN_QuantifierSegmenter implements ISegmenter {
- /** 子分词器标签 */
- static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
+ /**
+ * 子分词器标签
+ */
+ static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
- /** 中文数词 */
- private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
+ /**
+ * 中文数词
+ */
+ private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";
- private static Set ChnNumberChars = new HashSet<>();
+ private static Set ChnNumberChars = new HashSet<>();
- static {
- char[] ca = Chn_Num.toCharArray();
- for (char nChar : ca) {
- ChnNumberChars.add(nChar);
+ static {
+ char[] ca = Chn_Num.toCharArray();
+ for (char nChar : ca) {
+ ChnNumberChars.add(nChar);
+ }
}
- }
- /** 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 */
- private int nStart;
- /** 记录词元结束位置 end记录的是在词元中最后一个出现的合理的数词结束 */
- private int nEnd;
+ /**
+ * 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符
+ */
+ private int nStart;
+ /**
+ * 记录词元结束位置 end记录的是在词元中最后一个出现的合理的数词结束
+ */
+ private int nEnd;
- /** 待处理的量词hit队列 */
- private final List countHits;
+ /**
+ * 待处理的量词hit队列
+ */
+ private final List countHits;
- CN_QuantifierSegmenter() {
- nStart = -1;
- nEnd = -1;
- this.countHits = new LinkedList();
- }
-
- /** 分词 */
- @Override
- public void analyze(AnalyzeContext context) {
- // 处理中文数词
- this.processCNumber(context);
- // 处理中文量词
- this.processCount(context);
- // 判断是否锁定缓冲区
- if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) {
- // 对缓冲区解锁
- context.unlockBuffer(SEGMENTER_NAME);
- } else {
- context.lockBuffer(SEGMENTER_NAME);
- }
- }
-
- /** 重置子分词器状态 */
- @Override
- public void reset() {
- nStart = -1;
- nEnd = -1;
- countHits.clear();
- }
-
- /** 处理数词 */
- private void processCNumber(AnalyzeContext context) {
- if (nStart == -1 && nEnd == -1) { // 初始状态
- if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
- && ChnNumberChars.contains(context.getCurrentChar())) {
- // 记录数词的起始、结束位置
- nStart = context.getCursor();
- nEnd = context.getCursor();
- }
- } else { // 正在处理状态
- if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
- && ChnNumberChars.contains(context.getCurrentChar())) {
- // 记录数词的结束位置
- nEnd = context.getCursor();
- } else {
- // 输出数词
- this.outputNumLexeme(context);
- // 重置头尾指针
+ CN_QuantifierSegmenter() {
nStart = -1;
nEnd = -1;
- }
+ this.countHits = new LinkedList();
}
- // 缓冲区已经用完,还有尚未输出的数词
- if (context.isBufferConsumed()) {
- if (nStart != -1 && nEnd != -1) {
- // 输出数词
- outputNumLexeme(context);
- // 重置头尾指针
+ /**
+ * 分词
+ */
+ @Override
+ public void analyze(AnalyzeContext context) {
+ // 处理中文数词
+ this.processCNumber(context);
+ // 处理中文量词
+ this.processCount(context);
+ // 判断是否锁定缓冲区
+ if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) {
+ // 对缓冲区解锁
+ context.unlockBuffer(SEGMENTER_NAME);
+ } else {
+ context.lockBuffer(SEGMENTER_NAME);
+ }
+ }
+
+ /**
+ * 重置子分词器状态
+ */
+ @Override
+ public void reset() {
nStart = -1;
nEnd = -1;
- }
+ countHits.clear();
}
- }
- /**
- * 处理中文量词
- *
- * @param context
- */
- private void processCount(AnalyzeContext context) {
- // 判断是否需要启动量词扫描
- if (!this.needCountScan(context)) {
- return;
- }
- if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {
- // 优先处理countHits中的hit
- if (!this.countHits.isEmpty()) {
- // 处理词段队列
- Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
- for (Hit hit : tmpArray) {
- hit =
- Dictionary.getSingleton()
- .matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
- if (hit.isMatch()) {
- // 输出当前的词
- Lexeme newLexeme =
- new Lexeme(
- context.getBufferOffset(),
- hit.getBegin(),
- context.getCursor() - hit.getBegin() + 1,
- Lexeme.TYPE_COUNT);
- context.addLexeme(newLexeme);
-
- if (!hit.isPrefix()) { // 不是词前缀,hit不需要继续匹配,移除
- this.countHits.remove(hit);
+ /**
+ * 处理数词
+ */
+ private void processCNumber(AnalyzeContext context) {
+ if (nStart == -1 && nEnd == -1) { // 初始状态
+ if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
+ && ChnNumberChars.contains(context.getCurrentChar())) {
+ // 记录数词的起始、结束位置
+ nStart = context.getCursor();
+ nEnd = context.getCursor();
+ }
+ } else { // 正在处理状态
+ if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
+ && ChnNumberChars.contains(context.getCurrentChar())) {
+ // 记录数词的结束位置
+ nEnd = context.getCursor();
+ } else {
+ // 输出数词
+ this.outputNumLexeme(context);
+ // 重置头尾指针
+ nStart = -1;
+ nEnd = -1;
}
-
- } else if (hit.isUnmatch()) {
- // hit不是词,移除
- this.countHits.remove(hit);
- }
}
- }
- // 对当前指针位置的字符进行单字匹配
- Hit singleCharHit =
- Dictionary.getSingleton()
- .matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
- if (singleCharHit.isMatch()) { // 首字成量词词
- // 输出当前的词
- Lexeme newLexeme =
- new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT);
- context.addLexeme(newLexeme);
- // 同时也是词前缀
- if (singleCharHit.isPrefix()) {
- // 前缀匹配则放入hit列表
- this.countHits.add(singleCharHit);
- }
- } else if (singleCharHit.isPrefix()) { // 首字为量词前缀
- // 前缀匹配则放入hit列表
- this.countHits.add(singleCharHit);
- }
- } else {
- // 输入的不是中文字符
- // 清空未成形的量词
- this.countHits.clear();
- }
- // 缓冲区数据已经读完,还有尚未输出的量词
- if (context.isBufferConsumed()) {
- // 清空未成形的量词
- this.countHits.clear();
- }
- }
- /**
- * 判断是否需要扫描量词
- *
- * @return
- */
- private boolean needCountScan(AnalyzeContext context) {
- if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) {
- // 正在处理中文数词,或者正在处理量词
- return true;
- } else {
- // 找到一个相邻的数词
- if (!context.getOrgLexemes().isEmpty()) {
- Lexeme l = context.getOrgLexemes().peekLast();
- if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) {
- if (l.getBegin() + l.getLength() == context.getCursor()) {
+ // 缓冲区已经用完,还有尚未输出的数词
+ if (context.isBufferConsumed()) {
+ if (nStart != -1 && nEnd != -1) {
+ // 输出数词
+ outputNumLexeme(context);
+ // 重置头尾指针
+ nStart = -1;
+ nEnd = -1;
+ }
+ }
+ }
+
+ /**
+ * 处理中文量词
+ *
+ * @param context
+ */
+ private void processCount(AnalyzeContext context) {
+ // 判断是否需要启动量词扫描
+ if (!this.needCountScan(context)) {
+ return;
+ }
+ if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {
+ // 优先处理countHits中的hit
+ if (!this.countHits.isEmpty()) {
+ // 处理词段队列
+ Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
+ for (Hit hit : tmpArray) {
+ hit =
+ Dictionary.getSingleton()
+ .matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
+ if (hit.isMatch()) {
+ // 输出当前的词
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ hit.getBegin(),
+ context.getCursor() - hit.getBegin() + 1,
+ Lexeme.TYPE_COUNT);
+ context.addLexeme(newLexeme);
+
+ if (!hit.isPrefix()) { // 不是词前缀,hit不需要继续匹配,移除
+ this.countHits.remove(hit);
+ }
+
+ } else if (hit.isUnmatch()) {
+ // hit不是词,移除
+ this.countHits.remove(hit);
+ }
+ }
+ }
+ // 对当前指针位置的字符进行单字匹配
+ Hit singleCharHit =
+ Dictionary.getSingleton()
+ .matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
+ if (singleCharHit.isMatch()) { // 首字成量词词
+ // 输出当前的词
+ Lexeme newLexeme =
+ new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT);
+ context.addLexeme(newLexeme);
+ // 同时也是词前缀
+ if (singleCharHit.isPrefix()) {
+ // 前缀匹配则放入hit列表
+ this.countHits.add(singleCharHit);
+ }
+ } else if (singleCharHit.isPrefix()) { // 首字为量词前缀
+ // 前缀匹配则放入hit列表
+ this.countHits.add(singleCharHit);
+ }
+ } else {
+ // 输入的不是中文字符
+ // 清空未成形的量词
+ this.countHits.clear();
+ }
+ // 缓冲区数据已经读完,还有尚未输出的量词
+ if (context.isBufferConsumed()) {
+ // 清空未成形的量词
+ this.countHits.clear();
+ }
+ }
+
+ /**
+ * 判断是否需要扫描量词
+ *
+ * @return
+ */
+ private boolean needCountScan(AnalyzeContext context) {
+ if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) {
+ // 正在处理中文数词,或者正在处理量词
return true;
- }
+ } else {
+ // 找到一个相邻的数词
+ if (!context.getOrgLexemes().isEmpty()) {
+ Lexeme l = context.getOrgLexemes().peekLast();
+ if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) {
+ if (l.getBegin() + l.getLength() == context.getCursor()) {
+ return true;
+ }
+ }
+ }
}
- }
+ return false;
}
- return false;
- }
- /**
- * 添加数词词元到结果集
- *
- * @param context
- */
- private void outputNumLexeme(AnalyzeContext context) {
- if (nStart > -1 && nEnd > -1) {
- // 输出数词
- Lexeme newLexeme =
- new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM);
- context.addLexeme(newLexeme);
+ /**
+ * 添加数词词元到结果集
+ *
+ * @param context
+ */
+ private void outputNumLexeme(AnalyzeContext context) {
+ if (nStart > -1 && nEnd > -1) {
+ // 输出数词
+ Lexeme newLexeme =
+ new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM);
+ context.addLexeme(newLexeme);
+ }
}
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/CharacterUtil.java b/src/main/java/com/rymcu/forest/lucene/core/CharacterUtil.java
index cf367ee..dd3da97 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/CharacterUtil.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/CharacterUtil.java
@@ -21,74 +21,76 @@
*/
package com.rymcu.forest.lucene.core;
-/** 字符集识别工具类 */
+/**
+ * 字符集识别工具类
+ */
class CharacterUtil {
- public static final int CHAR_USELESS = 0;
+ public static final int CHAR_USELESS = 0;
- public static final int CHAR_ARABIC = 0X00000001;
+ public static final int CHAR_ARABIC = 0X00000001;
- public static final int CHAR_ENGLISH = 0X00000002;
+ public static final int CHAR_ENGLISH = 0X00000002;
- public static final int CHAR_CHINESE = 0X00000004;
+ public static final int CHAR_CHINESE = 0X00000004;
- public static final int CHAR_OTHER_CJK = 0X00000008;
+ public static final int CHAR_OTHER_CJK = 0X00000008;
- /**
- * 识别字符类型
- *
- * @param input
- * @return int CharacterUtil定义的字符类型常量
- */
- static int identifyCharType(char input) {
- if (input >= '0' && input <= '9') {
- return CHAR_ARABIC;
+ /**
+ * 识别字符类型
+ *
+ * @param input
+ * @return int CharacterUtil定义的字符类型常量
+ */
+ static int identifyCharType(char input) {
+ if (input >= '0' && input <= '9') {
+ return CHAR_ARABIC;
- } else if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')) {
- return CHAR_ENGLISH;
+ } else if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')) {
+ return CHAR_ENGLISH;
- } else {
- Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
+ } else {
+ Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
- if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
- || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
- || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
- // 目前已知的中文字符UTF-8集合
- return CHAR_CHINESE;
+ if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
+ || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
+ || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
+ // 目前已知的中文字符UTF-8集合
+ return CHAR_CHINESE;
- } else if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS // 全角数字字符和日韩字符
- // 韩文字符集
- || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
- || ub == Character.UnicodeBlock.HANGUL_JAMO
- || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
- // 日文字符集
- || ub == Character.UnicodeBlock.HIRAGANA // 平假名
- || ub == Character.UnicodeBlock.KATAKANA // 片假名
- || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
- return CHAR_OTHER_CJK;
- }
- }
- // 其他的不做处理的字符
- return CHAR_USELESS;
- }
-
- /**
- * 进行字符规格化(全角转半角,大写转小写处理)
- *
- * @param input
- * @return char
- */
- static char regularize(char input) {
- if (input == 12288) {
- input = (char) 32;
-
- } else if (input > 65280 && input < 65375) {
- input = (char) (input - 65248);
-
- } else if (input >= 'A' && input <= 'Z') {
- input += 32;
+ } else if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS // 全角数字字符和日韩字符
+ // 韩文字符集
+ || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
+ || ub == Character.UnicodeBlock.HANGUL_JAMO
+ || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
+ // 日文字符集
+ || ub == Character.UnicodeBlock.HIRAGANA // 平假名
+ || ub == Character.UnicodeBlock.KATAKANA // 片假名
+ || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
+ return CHAR_OTHER_CJK;
+ }
+ }
+ // 其他的不做处理的字符
+ return CHAR_USELESS;
}
- return input;
- }
+ /**
+ * 进行字符规格化(全角转半角,大写转小写处理)
+ *
+ * @param input
+ * @return char
+ */
+ static char regularize(char input) {
+ if (input == 12288) {
+ input = (char) 32;
+
+ } else if (input > 65280 && input < 65375) {
+ input = (char) (input - 65248);
+
+ } else if (input >= 'A' && input <= 'Z') {
+ input += 32;
+ }
+
+ return input;
+ }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java b/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java
index 401242e..a52fd1a 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/IKArbitrator.java
@@ -22,123 +22,126 @@ package com.rymcu.forest.lucene.core;
import java.util.Stack;
import java.util.TreeSet;
-/** IK分词歧义裁决器 */
+/**
+ * IK分词歧义裁决器
+ */
class IKArbitrator {
- IKArbitrator() {}
+ IKArbitrator() {
+ }
- /**
- * 分词歧义处理
- *
- * @param context
- * @param useSmart
- */
- void process(AnalyzeContext context, boolean useSmart) {
- QuickSortSet orgLexemes = context.getOrgLexemes();
- Lexeme orgLexeme = orgLexemes.pollFirst();
+ /**
+ * 分词歧义处理
+ *
+ * @param context
+ * @param useSmart
+ */
+ void process(AnalyzeContext context, boolean useSmart) {
+ QuickSortSet orgLexemes = context.getOrgLexemes();
+ Lexeme orgLexeme = orgLexemes.pollFirst();
- LexemePath crossPath = new LexemePath();
- while (orgLexeme != null) {
- if (!crossPath.addCrossLexeme(orgLexeme)) {
- // 找到与crossPath不相交的下一个crossPath
- if (crossPath.size() == 1 || !useSmart) {
- // crossPath没有歧义 或者 不做歧义处理
- // 直接输出当前crossPath
- context.addLexemePath(crossPath);
- } else {
- // 对当前的crossPath进行歧义处理
- QuickSortSet.Cell headCell = crossPath.getHead();
- LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
- // 输出歧义处理结果judgeResult
- context.addLexemePath(judgeResult);
+ LexemePath crossPath = new LexemePath();
+ while (orgLexeme != null) {
+ if (!crossPath.addCrossLexeme(orgLexeme)) {
+ // 找到与crossPath不相交的下一个crossPath
+ if (crossPath.size() == 1 || !useSmart) {
+ // crossPath没有歧义 或者 不做歧义处理
+ // 直接输出当前crossPath
+ context.addLexemePath(crossPath);
+ } else {
+ // 对当前的crossPath进行歧义处理
+ QuickSortSet.Cell headCell = crossPath.getHead();
+ LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
+ // 输出歧义处理结果judgeResult
+ context.addLexemePath(judgeResult);
+ }
+
+ // 把orgLexeme加入新的crossPath中
+ crossPath = new LexemePath();
+ crossPath.addCrossLexeme(orgLexeme);
+ }
+ orgLexeme = orgLexemes.pollFirst();
}
- // 把orgLexeme加入新的crossPath中
- crossPath = new LexemePath();
- crossPath.addCrossLexeme(orgLexeme);
- }
- orgLexeme = orgLexemes.pollFirst();
+ // 处理最后的path
+ if (crossPath.size() == 1 || !useSmart) {
+ // crossPath没有歧义 或者 不做歧义处理
+ // 直接输出当前crossPath
+ context.addLexemePath(crossPath);
+ } else {
+ // 对当前的crossPath进行歧义处理
+ QuickSortSet.Cell headCell = crossPath.getHead();
+ LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
+ // 输出歧义处理结果judgeResult
+ context.addLexemePath(judgeResult);
+ }
}
- // 处理最后的path
- if (crossPath.size() == 1 || !useSmart) {
- // crossPath没有歧义 或者 不做歧义处理
- // 直接输出当前crossPath
- context.addLexemePath(crossPath);
- } else {
- // 对当前的crossPath进行歧义处理
- QuickSortSet.Cell headCell = crossPath.getHead();
- LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
- // 输出歧义处理结果judgeResult
- context.addLexemePath(judgeResult);
- }
- }
+ /**
+ * 歧义识别
+ *
+ * @param lexemeCell 歧义路径链表头
+ * @param fullTextLength 歧义路径文本长度
+     *
+     * @return LexemePath 候选结果路径
+ */
+ private LexemePath judge(QuickSortSet.Cell lexemeCell, int fullTextLength) {
+ // 候选路径集合
+ TreeSet pathOptions = new TreeSet();
+ // 候选结果路径
+ LexemePath option = new LexemePath();
- /**
- * 歧义识别
- *
- * @param lexemeCell 歧义路径链表头
- * @param fullTextLength 歧义路径文本长度
- * @param fullTextLength 候选结果路径
- * @return
- */
- private LexemePath judge(QuickSortSet.Cell lexemeCell, int fullTextLength) {
- // 候选路径集合
- TreeSet pathOptions = new TreeSet();
- // 候选结果路径
- LexemePath option = new LexemePath();
+ // 对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
+ Stack lexemeStack = this.forwardPath(lexemeCell, option);
- // 对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
- Stack lexemeStack = this.forwardPath(lexemeCell, option);
+ // 当前词元链并非最理想的,加入候选路径集合
+ pathOptions.add(option.copy());
- // 当前词元链并非最理想的,加入候选路径集合
- pathOptions.add(option.copy());
+ // 存在歧义词,处理
+ QuickSortSet.Cell c = null;
+ while (!lexemeStack.isEmpty()) {
+ c = lexemeStack.pop();
+ // 回滚词元链
+ this.backPath(c.getLexeme(), option);
+ // 从歧义词位置开始,递归,生成可选方案
+ this.forwardPath(c, option);
+ pathOptions.add(option.copy());
+ }
- // 存在歧义词,处理
- QuickSortSet.Cell c = null;
- while (!lexemeStack.isEmpty()) {
- c = lexemeStack.pop();
- // 回滚词元链
- this.backPath(c.getLexeme(), option);
- // 从歧义词位置开始,递归,生成可选方案
- this.forwardPath(c, option);
- pathOptions.add(option.copy());
+ // 返回集合中的最优方案
+ return pathOptions.first();
}
- // 返回集合中的最优方案
- return pathOptions.first();
- }
-
- /**
- * 向前遍历,添加词元,构造一个无歧义词元组合
- *
- * @param option path
- * @return
- */
- private Stack forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
- // 发生冲突的Lexeme栈
- Stack conflictStack = new Stack();
- QuickSortSet.Cell c = lexemeCell;
- // 迭代遍历Lexeme链表
- while (c != null && c.getLexeme() != null) {
- if (!option.addNotCrossLexeme(c.getLexeme())) {
- // 词元交叉,添加失败则加入lexemeStack栈
- conflictStack.push(c);
- }
- c = c.getNext();
+ /**
+ * 向前遍历,添加词元,构造一个无歧义词元组合
+ *
+ * @param option path
+ * @return
+ */
+ private Stack forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
+ // 发生冲突的Lexeme栈
+ Stack conflictStack = new Stack();
+ QuickSortSet.Cell c = lexemeCell;
+ // 迭代遍历Lexeme链表
+ while (c != null && c.getLexeme() != null) {
+ if (!option.addNotCrossLexeme(c.getLexeme())) {
+ // 词元交叉,添加失败则加入lexemeStack栈
+ conflictStack.push(c);
+ }
+ c = c.getNext();
+ }
+ return conflictStack;
}
- return conflictStack;
- }
- /**
- * 回滚词元链,直到它能够接受指定的词元
- *
- * @param option
- * @param l
- */
- private void backPath(Lexeme l, LexemePath option) {
- while (option.checkCross(l)) {
- option.removeTail();
+ /**
+ * 回滚词元链,直到它能够接受指定的词元
+ *
+ * @param option
+ * @param l
+ */
+ private void backPath(Lexeme l, LexemePath option) {
+ while (option.checkCross(l)) {
+ option.removeTail();
+ }
}
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/IKSegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/IKSegmenter.java
index 9054c91..8321252 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/IKSegmenter.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/IKSegmenter.java
@@ -28,131 +28,145 @@ import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
-/** IK分词器主类 */
+/**
+ * IK分词器主类
+ */
public final class IKSegmenter {
- /** 字符窜reader */
- private Reader input;
- /** 分词器配置项 */
- private Configuration cfg;
- /** 分词器上下文 */
- private AnalyzeContext context;
- /** 分词处理器列表 */
- private List segmenters;
- /** 分词歧义裁决器 */
- private IKArbitrator arbitrator;
+ /**
+     * 字符串reader
+ */
+ private Reader input;
+ /**
+ * 分词器配置项
+ */
+ private Configuration cfg;
+ /**
+ * 分词器上下文
+ */
+ private AnalyzeContext context;
+ /**
+ * 分词处理器列表
+ */
+ private List segmenters;
+ /**
+ * 分词歧义裁决器
+ */
+ private IKArbitrator arbitrator;
- /**
- * IK分词器构造函数
- *
- * @param input
- * @param useSmart 为true,使用智能分词策略
- * 非智能分词:细粒度输出所有可能的切分结果 智能分词: 合并数词和量词,对分词结果进行歧义判断
- */
- public IKSegmenter(Reader input, boolean useSmart) {
- this.input = input;
- this.cfg = DefaultConfig.getInstance();
- this.cfg.setUseSmart(useSmart);
- this.init();
- }
+ /**
+ * IK分词器构造函数
+ *
+ * @param input
+ * @param useSmart 为true,使用智能分词策略
+ *
+     * 非智能分词:细粒度输出所有可能的切分结果 智能分词: 合并数词和量词,对分词结果进行歧义判断
+ */
+ public IKSegmenter(Reader input, boolean useSmart) {
+ this.input = input;
+ this.cfg = DefaultConfig.getInstance();
+ this.cfg.setUseSmart(useSmart);
+ this.init();
+ }
- /**
- * IK分词器构造函数
- *
- * @param input
- * @param cfg 使用自定义的Configuration构造分词器
- */
- public IKSegmenter(Reader input, Configuration cfg) {
- this.input = input;
- this.cfg = cfg;
- this.init();
- }
+ /**
+ * IK分词器构造函数
+ *
+ * @param input
+ * @param cfg 使用自定义的Configuration构造分词器
+ */
+ public IKSegmenter(Reader input, Configuration cfg) {
+ this.input = input;
+ this.cfg = cfg;
+ this.init();
+ }
- /** 初始化 */
- private void init() {
- // 初始化词典单例
- Dictionary.initial(this.cfg);
- // 初始化分词上下文
- this.context = new AnalyzeContext(this.cfg);
- // 加载子分词器
- this.segmenters = this.loadSegmenters();
- // 加载歧义裁决器
- this.arbitrator = new IKArbitrator();
- }
+ /**
+ * 初始化
+ */
+ private void init() {
+ // 初始化词典单例
+ Dictionary.initial(this.cfg);
+ // 初始化分词上下文
+ this.context = new AnalyzeContext(this.cfg);
+ // 加载子分词器
+ this.segmenters = this.loadSegmenters();
+ // 加载歧义裁决器
+ this.arbitrator = new IKArbitrator();
+ }
- /**
- * 初始化词典,加载子分词器实现
- *
- * @return List
- */
- private List loadSegmenters() {
- List segmenters = new ArrayList(4);
- // 处理字母的子分词器
- segmenters.add(new LetterSegmenter());
- // 处理中文数量词的子分词器
- segmenters.add(new CN_QuantifierSegmenter());
- // 处理中文词的子分词器
- segmenters.add(new CJKSegmenter());
- return segmenters;
- }
+ /**
+ * 初始化词典,加载子分词器实现
+ *
+ * @return List
+ */
+ private List loadSegmenters() {
+ List segmenters = new ArrayList(4);
+ // 处理字母的子分词器
+ segmenters.add(new LetterSegmenter());
+ // 处理中文数量词的子分词器
+ segmenters.add(new CN_QuantifierSegmenter());
+ // 处理中文词的子分词器
+ segmenters.add(new CJKSegmenter());
+ return segmenters;
+ }
- /**
- * 分词,获取下一个词元
- *
- * @return Lexeme 词元对象
- * @throws IOException
- */
- public synchronized Lexeme next() throws IOException {
- Lexeme l = null;
- while ((l = context.getNextLexeme()) == null) {
- /*
- * 从reader中读取数据,填充buffer 如果reader是分次读入buffer的,那么buffer要 进行移位处理 移位处理上次读入的但未处理的数据
- */
- int available = context.fillBuffer(this.input);
- if (available <= 0) {
- // reader已经读完
- context.reset();
- return null;
+ /**
+ * 分词,获取下一个词元
+ *
+ * @return Lexeme 词元对象
+ * @throws IOException
+ */
+ public synchronized Lexeme next() throws IOException {
+ Lexeme l = null;
+ while ((l = context.getNextLexeme()) == null) {
+ /*
+ * 从reader中读取数据,填充buffer 如果reader是分次读入buffer的,那么buffer要 进行移位处理 移位处理上次读入的但未处理的数据
+ */
+ int available = context.fillBuffer(this.input);
+ if (available <= 0) {
+ // reader已经读完
+ context.reset();
+ return null;
- } else {
- // 初始化指针
- context.initCursor();
- do {
- // 遍历子分词器
- for (ISegmenter segmenter : segmenters) {
- segmenter.analyze(context);
- }
- // 字符缓冲区接近读完,需要读入新的字符
- if (context.needRefillBuffer()) {
- break;
- }
- // 向前移动指针
- } while (context.moveCursor());
- // 重置子分词器,为下轮循环进行初始化
- for (ISegmenter segmenter : segmenters) {
- segmenter.reset();
+ } else {
+ // 初始化指针
+ context.initCursor();
+ do {
+ // 遍历子分词器
+ for (ISegmenter segmenter : segmenters) {
+ segmenter.analyze(context);
+ }
+ // 字符缓冲区接近读完,需要读入新的字符
+ if (context.needRefillBuffer()) {
+ break;
+ }
+ // 向前移动指针
+ } while (context.moveCursor());
+ // 重置子分词器,为下轮循环进行初始化
+ for (ISegmenter segmenter : segmenters) {
+ segmenter.reset();
+ }
+ }
+ // 对分词进行歧义处理
+ this.arbitrator.process(context, this.cfg.useSmart());
+ // 将分词结果输出到结果集,并处理未切分的单个CJK字符
+ context.outputToResult();
+ // 记录本次分词的缓冲区位移
+ context.markBufferOffset();
}
- }
- // 对分词进行歧义处理
- this.arbitrator.process(context, this.cfg.useSmart());
- // 将分词结果输出到结果集,并处理未切分的单个CJK字符
- context.outputToResult();
- // 记录本次分词的缓冲区位移
- context.markBufferOffset();
+ return l;
}
- return l;
- }
- /**
- * 重置分词器到初始状态
- *
- * @param input
- */
- public synchronized void reset(Reader input) {
- this.input = input;
- context.reset();
- for (ISegmenter segmenter : segmenters) {
- segmenter.reset();
+ /**
+ * 重置分词器到初始状态
+ *
+ * @param input
+ */
+ public synchronized void reset(Reader input) {
+ this.input = input;
+ context.reset();
+ for (ISegmenter segmenter : segmenters) {
+ segmenter.reset();
+ }
}
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/ISegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/ISegmenter.java
index cf0e50a..043c7cf 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/ISegmenter.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/ISegmenter.java
@@ -1,44 +1,43 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
- *
+ *
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- *
+ *
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
- *
*/
package com.rymcu.forest.lucene.core;
/**
- *
+ *
* 子分词器接口
*/
interface ISegmenter {
- /**
- * 从分析器读取下一个可能分解的词元对象
- * @param context 分词算法上下文
- */
- void analyze(AnalyzeContext context);
+ /**
+ * 从分析器读取下一个可能分解的词元对象
+ * @param context 分词算法上下文
+ */
+ void analyze(AnalyzeContext context);
- /**
- * 重置子分析器状态
- */
- void reset();
+ /**
+ * 重置子分析器状态
+ */
+ void reset();
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/LetterSegmenter.java b/src/main/java/com/rymcu/forest/lucene/core/LetterSegmenter.java
index 0a53aff..38cec3e 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/LetterSegmenter.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/LetterSegmenter.java
@@ -21,263 +21,283 @@ package com.rymcu.forest.lucene.core;
import java.util.Arrays;
-/** 英文字符及阿拉伯数字子分词器 */
+/**
+ * 英文字符及阿拉伯数字子分词器
+ */
class LetterSegmenter implements ISegmenter {
- /** 子分词器标签 */
- static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
- /** 链接符号 */
- private static final char[] Letter_Connector = new char[] {'#', '&', '+', '-', '.', '@', '_'};
- /** 数字符号 */
- private static final char[] Num_Connector = new char[] {',', '.'};
- /** 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 */
- private int start;
- /** 记录词元结束位置 end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 */
- private int end;
- /** 字母起始位置 */
- private int englishStart;
- /** 字母结束位置 */
- private int englishEnd;
- /** 阿拉伯数字起始位置 */
- private int arabicStart;
- /** 阿拉伯数字结束位置 */
- private int arabicEnd;
+ /**
+ * 子分词器标签
+ */
+ static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
+ /**
+ * 链接符号
+ */
+ private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_'};
+ /**
+ * 数字符号
+ */
+ private static final char[] Num_Connector = new char[]{',', '.'};
+ /**
+ * 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符
+ */
+ private int start;
+ /**
+ * 记录词元结束位置 end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
+ */
+ private int end;
+ /**
+ * 字母起始位置
+ */
+ private int englishStart;
+ /**
+ * 字母结束位置
+ */
+ private int englishEnd;
+ /**
+ * 阿拉伯数字起始位置
+ */
+ private int arabicStart;
+ /**
+ * 阿拉伯数字结束位置
+ */
+ private int arabicEnd;
- LetterSegmenter() {
- Arrays.sort(Letter_Connector);
- Arrays.sort(Num_Connector);
- this.start = -1;
- this.end = -1;
- this.englishStart = -1;
- this.englishEnd = -1;
- this.arabicStart = -1;
- this.arabicEnd = -1;
- }
-
- @Override
- public void analyze(AnalyzeContext context) {
- boolean bufferLockFlag = false;
- // 处理英文字母
- bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag;
- // 处理阿拉伯字母
- bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
- // 处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复)
- bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
-
- // 判断是否锁定缓冲区
- if (bufferLockFlag) {
- context.lockBuffer(SEGMENTER_NAME);
- } else {
- // 对缓冲区解锁
- context.unlockBuffer(SEGMENTER_NAME);
- }
- }
-
- @Override
- public void reset() {
- this.start = -1;
- this.end = -1;
- this.englishStart = -1;
- this.englishEnd = -1;
- this.arabicStart = -1;
- this.arabicEnd = -1;
- }
-
- /**
- * 处理数字字母混合输出 如:windos2000 | linliangyi2005@gmail.com
- *
- * @param context
- * @return
- */
- private boolean processMixLetter(AnalyzeContext context) {
- boolean needLock = false;
-
- if (this.start == -1) { // 当前的分词器尚未开始处理字符
- if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
- || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
- // 记录起始指针的位置,标明分词器进入处理状态
- this.start = context.getCursor();
- this.end = start;
- }
-
- } else { // 当前的分词器正在处理字符
- if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
- || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
- // 记录下可能的结束位置
- this.end = context.getCursor();
-
- } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
- && this.isLetterConnector(context.getCurrentChar())) {
- // 记录下可能的结束位置
- this.end = context.getCursor();
- } else {
- // 遇到非Letter字符,输出词元
- Lexeme newLexeme =
- new Lexeme(
- context.getBufferOffset(),
- this.start,
- this.end - this.start + 1,
- Lexeme.TYPE_LETTER);
- context.addLexeme(newLexeme);
+ LetterSegmenter() {
+ Arrays.sort(Letter_Connector);
+ Arrays.sort(Num_Connector);
this.start = -1;
this.end = -1;
- }
+ this.englishStart = -1;
+ this.englishEnd = -1;
+ this.arabicStart = -1;
+ this.arabicEnd = -1;
}
- // 判断缓冲区是否已经读完
- if (context.isBufferConsumed()) {
- if (this.start != -1 && this.end != -1) {
- // 缓冲以读完,输出词元
- Lexeme newLexeme =
- new Lexeme(
- context.getBufferOffset(),
- this.start,
- this.end - this.start + 1,
- Lexeme.TYPE_LETTER);
- context.addLexeme(newLexeme);
+ @Override
+ public void analyze(AnalyzeContext context) {
+ boolean bufferLockFlag = false;
+ // 处理英文字母
+ bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag;
+ // 处理阿拉伯字母
+ bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
+ // 处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复)
+ bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
+
+ // 判断是否锁定缓冲区
+ if (bufferLockFlag) {
+ context.lockBuffer(SEGMENTER_NAME);
+ } else {
+ // 对缓冲区解锁
+ context.unlockBuffer(SEGMENTER_NAME);
+ }
+ }
+
+ @Override
+ public void reset() {
this.start = -1;
this.end = -1;
- }
- }
-
- // 判断是否锁定缓冲区
- // 对缓冲区解锁
- needLock = this.start != -1 || this.end != -1;
- return needLock;
- }
-
- /**
- * 处理纯英文字母输出
- *
- * @param context
- * @return
- */
- private boolean processEnglishLetter(AnalyzeContext context) {
- boolean needLock = false;
-
- if (this.englishStart == -1) { // 当前的分词器尚未开始处理英文字符
- if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
- // 记录起始指针的位置,标明分词器进入处理状态
- this.englishStart = context.getCursor();
- this.englishEnd = this.englishStart;
- }
- } else { // 当前的分词器正在处理英文字符
- if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
- // 记录当前指针位置为结束位置
- this.englishEnd = context.getCursor();
- } else {
- // 遇到非English字符,输出词元
- Lexeme newLexeme =
- new Lexeme(
- context.getBufferOffset(),
- this.englishStart,
- this.englishEnd - this.englishStart + 1,
- Lexeme.TYPE_ENGLISH);
- context.addLexeme(newLexeme);
this.englishStart = -1;
this.englishEnd = -1;
- }
- }
-
- // 判断缓冲区是否已经读完
- if (context.isBufferConsumed()) {
- if (this.englishStart != -1 && this.englishEnd != -1) {
- // 缓冲以读完,输出词元
- Lexeme newLexeme =
- new Lexeme(
- context.getBufferOffset(),
- this.englishStart,
- this.englishEnd - this.englishStart + 1,
- Lexeme.TYPE_ENGLISH);
- context.addLexeme(newLexeme);
- this.englishStart = -1;
- this.englishEnd = -1;
- }
- }
-
- // 判断是否锁定缓冲区
- // 对缓冲区解锁
- needLock = this.englishStart != -1 || this.englishEnd != -1;
- return needLock;
- }
-
- /**
- * 处理阿拉伯数字输出
- *
- * @param context
- * @return
- */
- private boolean processArabicLetter(AnalyzeContext context) {
- boolean needLock = false;
-
- if (this.arabicStart == -1) { // 当前的分词器尚未开始处理数字字符
- if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
- // 记录起始指针的位置,标明分词器进入处理状态
- this.arabicStart = context.getCursor();
- this.arabicEnd = this.arabicStart;
- }
- } else { // 当前的分词器正在处理数字字符
- if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
- // 记录当前指针位置为结束位置
- this.arabicEnd = context.getCursor();
- } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
- && this.isNumConnector(context.getCurrentChar())) {
- // 不输出数字,但不标记结束
- } else {
- // //遇到非Arabic字符,输出词元
- Lexeme newLexeme =
- new Lexeme(
- context.getBufferOffset(),
- this.arabicStart,
- this.arabicEnd - this.arabicStart + 1,
- Lexeme.TYPE_ARABIC);
- context.addLexeme(newLexeme);
this.arabicStart = -1;
this.arabicEnd = -1;
- }
}
- // 判断缓冲区是否已经读完
- if (context.isBufferConsumed()) {
- if (this.arabicStart != -1 && this.arabicEnd != -1) {
- // 生成已切分的词元
- Lexeme newLexeme =
- new Lexeme(
- context.getBufferOffset(),
- this.arabicStart,
- this.arabicEnd - this.arabicStart + 1,
- Lexeme.TYPE_ARABIC);
- context.addLexeme(newLexeme);
- this.arabicStart = -1;
- this.arabicEnd = -1;
- }
+ /**
+ * 处理数字字母混合输出 如:windos2000 | linliangyi2005@gmail.com
+ *
+ * @param context
+ * @return
+ */
+ private boolean processMixLetter(AnalyzeContext context) {
+ boolean needLock = false;
+
+ if (this.start == -1) { // 当前的分词器尚未开始处理字符
+ if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
+ || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
+ // 记录起始指针的位置,标明分词器进入处理状态
+ this.start = context.getCursor();
+ this.end = start;
+ }
+
+ } else { // 当前的分词器正在处理字符
+ if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
+ || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
+ // 记录下可能的结束位置
+ this.end = context.getCursor();
+
+ } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
+ && this.isLetterConnector(context.getCurrentChar())) {
+ // 记录下可能的结束位置
+ this.end = context.getCursor();
+ } else {
+ // 遇到非Letter字符,输出词元
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.start,
+ this.end - this.start + 1,
+ Lexeme.TYPE_LETTER);
+ context.addLexeme(newLexeme);
+ this.start = -1;
+ this.end = -1;
+ }
+ }
+
+ // 判断缓冲区是否已经读完
+ if (context.isBufferConsumed()) {
+ if (this.start != -1 && this.end != -1) {
+ // 缓冲以读完,输出词元
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.start,
+ this.end - this.start + 1,
+ Lexeme.TYPE_LETTER);
+ context.addLexeme(newLexeme);
+ this.start = -1;
+ this.end = -1;
+ }
+ }
+
+ // 判断是否锁定缓冲区
+ // 对缓冲区解锁
+ needLock = this.start != -1 || this.end != -1;
+ return needLock;
}
- // 判断是否锁定缓冲区
- // 对缓冲区解锁
- needLock = this.arabicStart != -1 || this.arabicEnd != -1;
- return needLock;
- }
+ /**
+ * 处理纯英文字母输出
+ *
+ * @param context
+ * @return
+ */
+ private boolean processEnglishLetter(AnalyzeContext context) {
+ boolean needLock = false;
- /**
- * 判断是否是字母连接符号
- *
- * @param input
- * @return
- */
- private boolean isLetterConnector(char input) {
- int index = Arrays.binarySearch(Letter_Connector, input);
- return index >= 0;
- }
+ if (this.englishStart == -1) { // 当前的分词器尚未开始处理英文字符
+ if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
+ // 记录起始指针的位置,标明分词器进入处理状态
+ this.englishStart = context.getCursor();
+ this.englishEnd = this.englishStart;
+ }
+ } else { // 当前的分词器正在处理英文字符
+ if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
+ // 记录当前指针位置为结束位置
+ this.englishEnd = context.getCursor();
+ } else {
+ // 遇到非English字符,输出词元
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.englishStart,
+ this.englishEnd - this.englishStart + 1,
+ Lexeme.TYPE_ENGLISH);
+ context.addLexeme(newLexeme);
+ this.englishStart = -1;
+ this.englishEnd = -1;
+ }
+ }
- /**
- * 判断是否是数字连接符号
- *
- * @param input
- * @return
- */
- private boolean isNumConnector(char input) {
- int index = Arrays.binarySearch(Num_Connector, input);
- return index >= 0;
- }
+ // 判断缓冲区是否已经读完
+ if (context.isBufferConsumed()) {
+ if (this.englishStart != -1 && this.englishEnd != -1) {
+ // 缓冲以读完,输出词元
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.englishStart,
+ this.englishEnd - this.englishStart + 1,
+ Lexeme.TYPE_ENGLISH);
+ context.addLexeme(newLexeme);
+ this.englishStart = -1;
+ this.englishEnd = -1;
+ }
+ }
+
+ // 判断是否锁定缓冲区
+ // 对缓冲区解锁
+ needLock = this.englishStart != -1 || this.englishEnd != -1;
+ return needLock;
+ }
+
+ /**
+ * 处理阿拉伯数字输出
+ *
+ * @param context
+ * @return
+ */
+ private boolean processArabicLetter(AnalyzeContext context) {
+ boolean needLock = false;
+
+ if (this.arabicStart == -1) { // 当前的分词器尚未开始处理数字字符
+ if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
+ // 记录起始指针的位置,标明分词器进入处理状态
+ this.arabicStart = context.getCursor();
+ this.arabicEnd = this.arabicStart;
+ }
+ } else { // 当前的分词器正在处理数字字符
+ if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
+ // 记录当前指针位置为结束位置
+ this.arabicEnd = context.getCursor();
+ } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
+ && this.isNumConnector(context.getCurrentChar())) {
+ // 不输出数字,但不标记结束
+ } else {
+ // //遇到非Arabic字符,输出词元
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.arabicStart,
+ this.arabicEnd - this.arabicStart + 1,
+ Lexeme.TYPE_ARABIC);
+ context.addLexeme(newLexeme);
+ this.arabicStart = -1;
+ this.arabicEnd = -1;
+ }
+ }
+
+ // 判断缓冲区是否已经读完
+ if (context.isBufferConsumed()) {
+ if (this.arabicStart != -1 && this.arabicEnd != -1) {
+ // 生成已切分的词元
+ Lexeme newLexeme =
+ new Lexeme(
+ context.getBufferOffset(),
+ this.arabicStart,
+ this.arabicEnd - this.arabicStart + 1,
+ Lexeme.TYPE_ARABIC);
+ context.addLexeme(newLexeme);
+ this.arabicStart = -1;
+ this.arabicEnd = -1;
+ }
+ }
+
+ // 判断是否锁定缓冲区
+ // 对缓冲区解锁
+ needLock = this.arabicStart != -1 || this.arabicEnd != -1;
+ return needLock;
+ }
+
+ /**
+ * 判断是否是字母连接符号
+ *
+ * @param input
+ * @return
+ */
+ private boolean isLetterConnector(char input) {
+ int index = Arrays.binarySearch(Letter_Connector, input);
+ return index >= 0;
+ }
+
+ /**
+ * 判断是否是数字连接符号
+ *
+ * @param input
+ * @return
+ */
+ private boolean isNumConnector(char input) {
+ int index = Arrays.binarySearch(Num_Connector, input);
+ return index >= 0;
+ }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/Lexeme.java b/src/main/java/com/rymcu/forest/lucene/core/Lexeme.java
index bba18be..e9eadec 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/Lexeme.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/Lexeme.java
@@ -1,26 +1,25 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
- *
+ *
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- *
+ *
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
- *
*/
package com.rymcu.forest.lucene.core;
@@ -28,252 +27,252 @@ package com.rymcu.forest.lucene.core;
* IK词元对象
*/
public class Lexeme implements Comparable {
- // lexemeType常量
- // 未知
- public static final int TYPE_UNKNOWN = 0;
- // 英文
- public static final int TYPE_ENGLISH = 1;
- // 数字
- public static final int TYPE_ARABIC = 2;
- // 英文数字混合
- public static final int TYPE_LETTER = 3;
- // 中文词元
- public static final int TYPE_CNWORD = 4;
- // 中文单字
- public static final int TYPE_CNCHAR = 64;
- // 日韩文字
- public static final int TYPE_OTHER_CJK = 8;
- // 中文数词
- public static final int TYPE_CNUM = 16;
- // 中文量词
- public static final int TYPE_COUNT = 32;
- // 中文数量词
- public static final int TYPE_CQUAN = 48;
+ // lexemeType常量
+ // 未知
+ public static final int TYPE_UNKNOWN = 0;
+ // 英文
+ public static final int TYPE_ENGLISH = 1;
+ // 数字
+ public static final int TYPE_ARABIC = 2;
+ // 英文数字混合
+ public static final int TYPE_LETTER = 3;
+ // 中文词元
+ public static final int TYPE_CNWORD = 4;
+ // 中文单字
+ public static final int TYPE_CNCHAR = 64;
+ // 日韩文字
+ public static final int TYPE_OTHER_CJK = 8;
+ // 中文数词
+ public static final int TYPE_CNUM = 16;
+ // 中文量词
+ public static final int TYPE_COUNT = 32;
+ // 中文数量词
+ public static final int TYPE_CQUAN = 48;
- // 词元的起始位移
- private int offset;
- // 词元的相对起始位置
- private int begin;
- // 词元的长度
- private int length;
- // 词元文本
- private String lexemeText;
- // 词元类型
- private int lexemeType;
+ // 词元的起始位移
+ private int offset;
+ // 词元的相对起始位置
+ private int begin;
+ // 词元的长度
+ private int length;
+ // 词元文本
+ private String lexemeText;
+ // 词元类型
+ private int lexemeType;
- public Lexeme(int offset, int begin, int length, int lexemeType) {
- this.offset = offset;
- this.begin = begin;
- if (length < 0) {
- throw new IllegalArgumentException("length < 0");
- }
- this.length = length;
- this.lexemeType = lexemeType;
- }
-
- /*
- * 判断词元相等算法 起始位置偏移、起始位置、终止位置相同
- * @see java.lang.Object#equals(Object o)
- */
- public boolean equals(Object o) {
- if (o == null) {
- return false;
+ public Lexeme(int offset, int begin, int length, int lexemeType) {
+ this.offset = offset;
+ this.begin = begin;
+ if (length < 0) {
+ throw new IllegalArgumentException("length < 0");
+ }
+ this.length = length;
+ this.lexemeType = lexemeType;
}
- if (this == o) {
- return true;
+ /*
+ * 判断词元相等算法 起始位置偏移、起始位置、终止位置相同
+ * @see java.lang.Object#equals(Object o)
+ */
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ }
+
+ if (this == o) {
+ return true;
+ }
+
+ if (o instanceof Lexeme) {
+ Lexeme other = (Lexeme) o;
+ if (this.offset == other.getOffset() && this.begin == other.getBegin()
+ && this.length == other.getLength()) {
+ return true;
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
}
- if (o instanceof Lexeme) {
- Lexeme other = (Lexeme) o;
- if (this.offset == other.getOffset() && this.begin == other.getBegin()
- && this.length == other.getLength()) {
- return true;
- } else {
- return false;
- }
- } else {
- return false;
+ /*
+ * 词元哈希编码算法
+ * @see java.lang.Object#hashCode()
+ */
+ public int hashCode() {
+ int absBegin = getBeginPosition();
+ int absEnd = getEndPosition();
+ return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
}
- }
- /*
- * 词元哈希编码算法
- * @see java.lang.Object#hashCode()
- */
- public int hashCode() {
- int absBegin = getBeginPosition();
- int absEnd = getEndPosition();
- return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
- }
+ /*
+ * 词元在排序集合中的比较算法
+ * @see java.lang.Comparable#compareTo(java.lang.Object)
+ */
+ public int compareTo(Lexeme other) {
+ // 起始位置优先
+ if (this.begin < other.getBegin()) {
+ return -1;
+ } else if (this.begin == other.getBegin()) {
+ // 词元长度优先
+ if (this.length > other.getLength()) {
+ return -1;
+ } else if (this.length == other.getLength()) {
+ return 0;
+ } else {// this.length < other.getLength()
+ return 1;
+ }
- /*
- * 词元在排序集合中的比较算法
- * @see java.lang.Comparable#compareTo(java.lang.Object)
- */
- public int compareTo(Lexeme other) {
- // 起始位置优先
- if (this.begin < other.getBegin()) {
- return -1;
- } else if (this.begin == other.getBegin()) {
- // 词元长度优先
- if (this.length > other.getLength()) {
- return -1;
- } else if (this.length == other.getLength()) {
- return 0;
- } else {// this.length < other.getLength()
- return 1;
- }
-
- } else {// this.begin > other.getBegin()
- return 1;
+ } else {// this.begin > other.getBegin()
+ return 1;
+ }
}
- }
- public int getOffset() {
- return offset;
- }
-
- public void setOffset(int offset) {
- this.offset = offset;
- }
-
- public int getBegin() {
- return begin;
- }
-
- /**
- * 获取词元在文本中的起始位置
- * @return int
- */
- public int getBeginPosition() {
- return offset + begin;
- }
-
- public void setBegin(int begin) {
- this.begin = begin;
- }
-
- /**
- * 获取词元在文本中的结束位置
- * @return int
- */
- public int getEndPosition() {
- return offset + begin + length;
- }
-
- /**
- * 获取词元的字符长度
- * @return int
- */
- public int getLength() {
- return this.length;
- }
-
- public void setLength(int length) {
- if (this.length < 0) {
- throw new IllegalArgumentException("length < 0");
+ public int getOffset() {
+ return offset;
}
- this.length = length;
- }
- /**
- * 获取词元的文本内容
- * @return String
- */
- public String getLexemeText() {
- if (lexemeText == null) {
- return "";
+ public void setOffset(int offset) {
+ this.offset = offset;
}
- return lexemeText;
- }
- public void setLexemeText(String lexemeText) {
- if (lexemeText == null) {
- this.lexemeText = "";
- this.length = 0;
- } else {
- this.lexemeText = lexemeText;
- this.length = lexemeText.length();
+ public int getBegin() {
+ return begin;
}
- }
- /**
- * 获取词元类型
- * @return int
- */
- public int getLexemeType() {
- return lexemeType;
- }
-
- /**
- * 获取词元类型标示字符串
- * @return String
- */
- public String getLexemeTypeString() {
- switch (lexemeType) {
-
- case TYPE_ENGLISH:
- return "ENGLISH";
-
- case TYPE_ARABIC:
- return "ARABIC";
-
- case TYPE_LETTER:
- return "LETTER";
-
- case TYPE_CNWORD:
- return "CN_WORD";
-
- case TYPE_CNCHAR:
- return "CN_CHAR";
-
- case TYPE_OTHER_CJK:
- return "OTHER_CJK";
-
- case TYPE_COUNT:
- return "COUNT";
-
- case TYPE_CNUM:
- return "TYPE_CNUM";
-
- case TYPE_CQUAN:
- return "TYPE_CQUAN";
-
- default:
- return "UNKONW";
+ /**
+ * 获取词元在文本中的起始位置
+ * @return int
+ */
+ public int getBeginPosition() {
+ return offset + begin;
}
- }
- public void setLexemeType(int lexemeType) {
- this.lexemeType = lexemeType;
- }
-
- /**
- * 合并两个相邻的词元
- * @param l
- * @param lexemeType
- * @return boolean 词元是否成功合并
- */
- public boolean append(Lexeme l, int lexemeType) {
- if (l != null && this.getEndPosition() == l.getBeginPosition()) {
- this.length += l.getLength();
- this.lexemeType = lexemeType;
- return true;
- } else {
- return false;
+ public void setBegin(int begin) {
+ this.begin = begin;
}
- }
- /**
- *
- */
- public String toString() {
- StringBuffer strbuf = new StringBuffer();
- strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
- strbuf.append(" : ").append(this.lexemeText).append(" : \t");
- strbuf.append(this.getLexemeTypeString());
- return strbuf.toString();
- }
+ /**
+ * 获取词元在文本中的结束位置
+ * @return int
+ */
+ public int getEndPosition() {
+ return offset + begin + length;
+ }
+
+ /**
+ * 获取词元的字符长度
+ * @return int
+ */
+ public int getLength() {
+ return this.length;
+ }
+
+ public void setLength(int length) {
+        if (length < 0) {
+ throw new IllegalArgumentException("length < 0");
+ }
+ this.length = length;
+ }
+
+ /**
+ * 获取词元的文本内容
+ * @return String
+ */
+ public String getLexemeText() {
+ if (lexemeText == null) {
+ return "";
+ }
+ return lexemeText;
+ }
+
+ public void setLexemeText(String lexemeText) {
+ if (lexemeText == null) {
+ this.lexemeText = "";
+ this.length = 0;
+ } else {
+ this.lexemeText = lexemeText;
+ this.length = lexemeText.length();
+ }
+ }
+
+ /**
+ * 获取词元类型
+ * @return int
+ */
+ public int getLexemeType() {
+ return lexemeType;
+ }
+
+ /**
+ * 获取词元类型标示字符串
+ * @return String
+ */
+ public String getLexemeTypeString() {
+ switch (lexemeType) {
+
+ case TYPE_ENGLISH:
+ return "ENGLISH";
+
+ case TYPE_ARABIC:
+ return "ARABIC";
+
+ case TYPE_LETTER:
+ return "LETTER";
+
+ case TYPE_CNWORD:
+ return "CN_WORD";
+
+ case TYPE_CNCHAR:
+ return "CN_CHAR";
+
+ case TYPE_OTHER_CJK:
+ return "OTHER_CJK";
+
+ case TYPE_COUNT:
+ return "COUNT";
+
+ case TYPE_CNUM:
+ return "TYPE_CNUM";
+
+ case TYPE_CQUAN:
+ return "TYPE_CQUAN";
+
+ default:
+                return "UNKNOWN";
+ }
+ }
+
+ public void setLexemeType(int lexemeType) {
+ this.lexemeType = lexemeType;
+ }
+
+ /**
+ * 合并两个相邻的词元
+ * @param l
+ * @param lexemeType
+ * @return boolean 词元是否成功合并
+ */
+ public boolean append(Lexeme l, int lexemeType) {
+ if (l != null && this.getEndPosition() == l.getBeginPosition()) {
+ this.length += l.getLength();
+ this.lexemeType = lexemeType;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ *
+ */
+ public String toString() {
+ StringBuffer strbuf = new StringBuffer();
+ strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
+ strbuf.append(" : ").append(this.lexemeText).append(" : \t");
+ strbuf.append(this.getLexemeTypeString());
+ return strbuf.toString();
+ }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/LexemePath.java b/src/main/java/com/rymcu/forest/lucene/core/LexemePath.java
index 1abff55..d121926 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/LexemePath.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/LexemePath.java
@@ -1,26 +1,25 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
- *
+ *
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- *
+ *
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
- *
*/
package com.rymcu.forest.lucene.core;
@@ -29,227 +28,227 @@ package com.rymcu.forest.lucene.core;
*/
class LexemePath extends QuickSortSet implements Comparable {
- // 起始位置
- private int pathBegin;
- // 结束
- private int pathEnd;
- // 词元链的有效字符长度
- private int payloadLength;
-
- LexemePath() {
- this.pathBegin = -1;
- this.pathEnd = -1;
- this.payloadLength = 0;
- }
-
- /**
- * 向LexemePath追加相交的Lexeme
- * @param lexeme
- * @return
- */
- boolean addCrossLexeme(Lexeme lexeme) {
- if (this.isEmpty()) {
- this.addLexeme(lexeme);
- this.pathBegin = lexeme.getBegin();
- this.pathEnd = lexeme.getBegin() + lexeme.getLength();
- this.payloadLength += lexeme.getLength();
- return true;
-
- } else if (this.checkCross(lexeme)) {
- this.addLexeme(lexeme);
- if (lexeme.getBegin() + lexeme.getLength() > this.pathEnd) {
- this.pathEnd = lexeme.getBegin() + lexeme.getLength();
- }
- this.payloadLength = this.pathEnd - this.pathBegin;
- return true;
-
- } else {
- return false;
+ // 起始位置
+ private int pathBegin;
+ // 结束
+ private int pathEnd;
+ // 词元链的有效字符长度
+ private int payloadLength;
+ LexemePath() {
+ this.pathBegin = -1;
+ this.pathEnd = -1;
+ this.payloadLength = 0;
}
- }
- /**
- * 向LexemePath追加不相交的Lexeme
- * @param lexeme
- * @return
- */
- boolean addNotCrossLexeme(Lexeme lexeme) {
- if (this.isEmpty()) {
- this.addLexeme(lexeme);
- this.pathBegin = lexeme.getBegin();
- this.pathEnd = lexeme.getBegin() + lexeme.getLength();
- this.payloadLength += lexeme.getLength();
- return true;
-
- } else if (this.checkCross(lexeme)) {
- return false;
-
- } else {
- this.addLexeme(lexeme);
- this.payloadLength += lexeme.getLength();
- Lexeme head = this.peekFirst();
- this.pathBegin = head.getBegin();
- Lexeme tail = this.peekLast();
- this.pathEnd = tail.getBegin() + tail.getLength();
- return true;
-
- }
- }
-
- /**
- * 移除尾部的Lexeme
- * @return
- */
- Lexeme removeTail() {
- Lexeme tail = this.pollLast();
- if (this.isEmpty()) {
- this.pathBegin = -1;
- this.pathEnd = -1;
- this.payloadLength = 0;
- } else {
- this.payloadLength -= tail.getLength();
- Lexeme newTail = this.peekLast();
- this.pathEnd = newTail.getBegin() + newTail.getLength();
- }
- return tail;
- }
-
- /**
- * 检测词元位置交叉(有歧义的切分)
- * @param lexeme
- * @return
- */
- boolean checkCross(Lexeme lexeme) {
- return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
- || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()
- + lexeme.getLength());
- }
-
- int getPathBegin() {
- return pathBegin;
- }
-
- int getPathEnd() {
- return pathEnd;
- }
-
- /**
- * 获取Path的有效词长
- * @return
- */
- int getPayloadLength() {
- return this.payloadLength;
- }
-
- /**
- * 获取LexemePath的路径长度
- * @return
- */
- int getPathLength() {
- return this.pathEnd - this.pathBegin;
- }
-
- /**
- * X权重(词元长度积)
- * @return
- */
- int getXWeight() {
- int product = 1;
- Cell c = this.getHead();
- while (c != null && c.getLexeme() != null) {
- product *= c.getLexeme().getLength();
- c = c.getNext();
- }
- return product;
- }
-
- /**
- * 词元位置权重
- * @return
- */
- int getPWeight() {
- int pWeight = 0;
- int p = 0;
- Cell c = this.getHead();
- while (c != null && c.getLexeme() != null) {
- p++;
- pWeight += p * c.getLexeme().getLength();
- c = c.getNext();
- }
- return pWeight;
- }
-
- LexemePath copy() {
- LexemePath theCopy = new LexemePath();
- theCopy.pathBegin = this.pathBegin;
- theCopy.pathEnd = this.pathEnd;
- theCopy.payloadLength = this.payloadLength;
- Cell c = this.getHead();
- while (c != null && c.getLexeme() != null) {
- theCopy.addLexeme(c.getLexeme());
- c = c.getNext();
- }
- return theCopy;
- }
-
- public int compareTo(LexemePath o) {
- // 比较有效文本长度
- if (this.payloadLength > o.payloadLength) {
- return -1;
- } else if (this.payloadLength < o.payloadLength) {
- return 1;
- } else {
- // 比较词元个数,越少越好
- if (this.size() < o.size()) {
- return -1;
- } else if (this.size() > o.size()) {
- return 1;
- } else {
- // 路径跨度越大越好
- if (this.getPathLength() > o.getPathLength()) {
- return -1;
- } else if (this.getPathLength() < o.getPathLength()) {
- return 1;
- } else {
- // 根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先
- if (this.pathEnd > o.pathEnd) {
- return -1;
- } else if (pathEnd < o.pathEnd) {
- return 1;
- } else {
- // 词长越平均越好
- if (this.getXWeight() > o.getXWeight()) {
- return -1;
- } else if (this.getXWeight() < o.getXWeight()) {
- return 1;
- } else {
- // 词元位置权重比较
- if (this.getPWeight() > o.getPWeight()) {
- return -1;
- } else if (this.getPWeight() < o.getPWeight()) {
- return 1;
- }
+ /**
+ * 向LexemePath追加相交的Lexeme
+ * @param lexeme
+ * @return
+ */
+ boolean addCrossLexeme(Lexeme lexeme) {
+ if (this.isEmpty()) {
+ this.addLexeme(lexeme);
+ this.pathBegin = lexeme.getBegin();
+ this.pathEnd = lexeme.getBegin() + lexeme.getLength();
+ this.payloadLength += lexeme.getLength();
+ return true;
+ } else if (this.checkCross(lexeme)) {
+ this.addLexeme(lexeme);
+ if (lexeme.getBegin() + lexeme.getLength() > this.pathEnd) {
+ this.pathEnd = lexeme.getBegin() + lexeme.getLength();
}
- }
- }
- }
- }
- return 0;
- }
+ this.payloadLength = this.pathEnd - this.pathBegin;
+ return true;
- public String toString() {
- StringBuffer sb = new StringBuffer();
- sb.append("pathBegin : ").append(pathBegin).append("\r\n");
- sb.append("pathEnd : ").append(pathEnd).append("\r\n");
- sb.append("payloadLength : ").append(payloadLength).append("\r\n");
- Cell head = this.getHead();
- while (head != null) {
- sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
- head = head.getNext();
+ } else {
+ return false;
+
+ }
+ }
+
+ /**
+ * 向LexemePath追加不相交的Lexeme
+ * @param lexeme
+ * @return
+ */
+ boolean addNotCrossLexeme(Lexeme lexeme) {
+ if (this.isEmpty()) {
+ this.addLexeme(lexeme);
+ this.pathBegin = lexeme.getBegin();
+ this.pathEnd = lexeme.getBegin() + lexeme.getLength();
+ this.payloadLength += lexeme.getLength();
+ return true;
+
+ } else if (this.checkCross(lexeme)) {
+ return false;
+
+ } else {
+ this.addLexeme(lexeme);
+ this.payloadLength += lexeme.getLength();
+ Lexeme head = this.peekFirst();
+ this.pathBegin = head.getBegin();
+ Lexeme tail = this.peekLast();
+ this.pathEnd = tail.getBegin() + tail.getLength();
+ return true;
+
+ }
+ }
+
+ /**
+ * 移除尾部的Lexeme
+ * @return
+ */
+ Lexeme removeTail() {
+ Lexeme tail = this.pollLast();
+ if (this.isEmpty()) {
+ this.pathBegin = -1;
+ this.pathEnd = -1;
+ this.payloadLength = 0;
+ } else {
+ this.payloadLength -= tail.getLength();
+ Lexeme newTail = this.peekLast();
+ this.pathEnd = newTail.getBegin() + newTail.getLength();
+ }
+ return tail;
+ }
+
+ /**
+ * 检测词元位置交叉(有歧义的切分)
+ * @param lexeme
+ * @return
+ */
+ boolean checkCross(Lexeme lexeme) {
+ return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
+ || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()
+ + lexeme.getLength());
+ }
+
+ int getPathBegin() {
+ return pathBegin;
+ }
+
+ int getPathEnd() {
+ return pathEnd;
+ }
+
+ /**
+ * 获取Path的有效词长
+ * @return
+ */
+ int getPayloadLength() {
+ return this.payloadLength;
+ }
+
+ /**
+ * 获取LexemePath的路径长度
+ * @return
+ */
+ int getPathLength() {
+ return this.pathEnd - this.pathBegin;
+ }
+
+ /**
+ * X权重(词元长度积)
+ * @return
+ */
+ int getXWeight() {
+ int product = 1;
+ Cell c = this.getHead();
+ while (c != null && c.getLexeme() != null) {
+ product *= c.getLexeme().getLength();
+ c = c.getNext();
+ }
+ return product;
+ }
+
+ /**
+ * 词元位置权重
+ * @return
+ */
+ int getPWeight() {
+ int pWeight = 0;
+ int p = 0;
+ Cell c = this.getHead();
+ while (c != null && c.getLexeme() != null) {
+ p++;
+ pWeight += p * c.getLexeme().getLength();
+ c = c.getNext();
+ }
+ return pWeight;
+ }
+
+ LexemePath copy() {
+ LexemePath theCopy = new LexemePath();
+ theCopy.pathBegin = this.pathBegin;
+ theCopy.pathEnd = this.pathEnd;
+ theCopy.payloadLength = this.payloadLength;
+ Cell c = this.getHead();
+ while (c != null && c.getLexeme() != null) {
+ theCopy.addLexeme(c.getLexeme());
+ c = c.getNext();
+ }
+ return theCopy;
+ }
+
+ public int compareTo(LexemePath o) {
+ // 比较有效文本长度
+ if (this.payloadLength > o.payloadLength) {
+ return -1;
+ } else if (this.payloadLength < o.payloadLength) {
+ return 1;
+ } else {
+ // 比较词元个数,越少越好
+ if (this.size() < o.size()) {
+ return -1;
+ } else if (this.size() > o.size()) {
+ return 1;
+ } else {
+ // 路径跨度越大越好
+ if (this.getPathLength() > o.getPathLength()) {
+ return -1;
+ } else if (this.getPathLength() < o.getPathLength()) {
+ return 1;
+ } else {
+ // 根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先
+ if (this.pathEnd > o.pathEnd) {
+ return -1;
+ } else if (pathEnd < o.pathEnd) {
+ return 1;
+ } else {
+ // 词长越平均越好
+ if (this.getXWeight() > o.getXWeight()) {
+ return -1;
+ } else if (this.getXWeight() < o.getXWeight()) {
+ return 1;
+ } else {
+ // 词元位置权重比较
+ if (this.getPWeight() > o.getPWeight()) {
+ return -1;
+ } else if (this.getPWeight() < o.getPWeight()) {
+ return 1;
+ }
+
+ }
+ }
+ }
+ }
+ }
+ return 0;
+ }
+
+ public String toString() {
+ StringBuffer sb = new StringBuffer();
+ sb.append("pathBegin : ").append(pathBegin).append("\r\n");
+ sb.append("pathEnd : ").append(pathEnd).append("\r\n");
+ sb.append("payloadLength : ").append(payloadLength).append("\r\n");
+ Cell head = this.getHead();
+ while (head != null) {
+ sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
+ head = head.getNext();
+ }
+ return sb.toString();
}
- return sb.toString();
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/core/QuickSortSet.java b/src/main/java/com/rymcu/forest/lucene/core/QuickSortSet.java
index cc1971b..f06c5ac 100644
--- a/src/main/java/com/rymcu/forest/lucene/core/QuickSortSet.java
+++ b/src/main/java/com/rymcu/forest/lucene/core/QuickSortSet.java
@@ -1,26 +1,25 @@
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
- *
+ *
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- *
+ *
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
- *
*/
package com.rymcu.forest.lucene.core;
@@ -28,212 +27,212 @@ package com.rymcu.forest.lucene.core;
* IK分词器专用的Lexem快速排序集合
*/
class QuickSortSet {
- // 链表头
- private Cell head;
- // 链表尾
- private Cell tail;
- // 链表的实际大小
- private int size;
+ // 链表头
+ private Cell head;
+ // 链表尾
+ private Cell tail;
+ // 链表的实际大小
+ private int size;
- QuickSortSet() {
- this.size = 0;
- }
+ QuickSortSet() {
+ this.size = 0;
+ }
- /**
- * 向链表集合添加词元
- * @param lexeme
- */
- boolean addLexeme(Lexeme lexeme) {
- Cell newCell = new Cell(lexeme);
- if (this.size == 0) {
- this.head = newCell;
- this.tail = newCell;
- this.size++;
- return true;
+ /**
+ * 向链表集合添加词元
+ * @param lexeme
+ */
+ boolean addLexeme(Lexeme lexeme) {
+ Cell newCell = new Cell(lexeme);
+ if (this.size == 0) {
+ this.head = newCell;
+ this.tail = newCell;
+ this.size++;
+ return true;
- } else {
- if (this.tail.compareTo(newCell) == 0) {// 词元与尾部词元相同,不放入集合
+ } else {
+ if (this.tail.compareTo(newCell) == 0) {// 词元与尾部词元相同,不放入集合
+ return false;
+
+ } else if (this.tail.compareTo(newCell) < 0) {// 词元接入链表尾部
+ this.tail.next = newCell;
+ newCell.prev = this.tail;
+ this.tail = newCell;
+ this.size++;
+ return true;
+
+ } else if (this.head.compareTo(newCell) > 0) {// 词元接入链表头部
+ this.head.prev = newCell;
+ newCell.next = this.head;
+ this.head = newCell;
+ this.size++;
+ return true;
+
+ } else {
+ // 从尾部上逆
+ Cell index = this.tail;
+ while (index != null && index.compareTo(newCell) > 0) {
+ index = index.prev;
+ }
+ if (index.compareTo(newCell) == 0) {// 词元与集合中的词元重复,不放入集合
+ return false;
+
+ } else if (index.compareTo(newCell) < 0) {// 词元插入链表中的某个位置
+ newCell.prev = index;
+ newCell.next = index.next;
+ index.next.prev = newCell;
+ index.next = newCell;
+ this.size++;
+ return true;
+ }
+ }
+ }
return false;
+ }
- } else if (this.tail.compareTo(newCell) < 0) {// 词元接入链表尾部
- this.tail.next = newCell;
- newCell.prev = this.tail;
- this.tail = newCell;
- this.size++;
- return true;
-
- } else if (this.head.compareTo(newCell) > 0) {// 词元接入链表头部
- this.head.prev = newCell;
- newCell.next = this.head;
- this.head = newCell;
- this.size++;
- return true;
-
- } else {
- // 从尾部上逆
- Cell index = this.tail;
- while (index != null && index.compareTo(newCell) > 0) {
- index = index.prev;
+ /**
+ * 返回链表头部元素
+ * @return
+ */
+ Lexeme peekFirst() {
+ if (this.head != null) {
+ return this.head.lexeme;
}
- if (index.compareTo(newCell) == 0) {// 词元与集合中的词元重复,不放入集合
- return false;
+ return null;
+ }
- } else if (index.compareTo(newCell) < 0) {// 词元插入链表中的某个位置
- newCell.prev = index;
- newCell.next = index.next;
- index.next.prev = newCell;
- index.next = newCell;
- this.size++;
- return true;
+ /**
+ * 取出链表集合的第一个元素
+ * @return Lexeme
+ */
+ Lexeme pollFirst() {
+ if (this.size == 1) {
+ Lexeme first = this.head.lexeme;
+ this.head = null;
+ this.tail = null;
+ this.size--;
+ return first;
+ } else if (this.size > 1) {
+ Lexeme first = this.head.lexeme;
+ this.head = this.head.next;
+ this.size--;
+ return first;
+ } else {
+ return null;
}
- }
- }
- return false;
- }
-
- /**
- * 返回链表头部元素
- * @return
- */
- Lexeme peekFirst() {
- if (this.head != null) {
- return this.head.lexeme;
- }
- return null;
- }
-
- /**
- * 取出链表集合的第一个元素
- * @return Lexeme
- */
- Lexeme pollFirst() {
- if (this.size == 1) {
- Lexeme first = this.head.lexeme;
- this.head = null;
- this.tail = null;
- this.size--;
- return first;
- } else if (this.size > 1) {
- Lexeme first = this.head.lexeme;
- this.head = this.head.next;
- this.size--;
- return first;
- } else {
- return null;
- }
- }
-
- /**
- * 返回链表尾部元素
- * @return
- */
- Lexeme peekLast() {
- if (this.tail != null) {
- return this.tail.lexeme;
- }
- return null;
- }
-
- /**
- * 取出链表集合的最后一个元素
- * @return Lexeme
- */
- Lexeme pollLast() {
- if (this.size == 1) {
- Lexeme last = this.head.lexeme;
- this.head = null;
- this.tail = null;
- this.size--;
- return last;
-
- } else if (this.size > 1) {
- Lexeme last = this.tail.lexeme;
- this.tail = this.tail.prev;
- this.size--;
- return last;
-
- } else {
- return null;
- }
- }
-
- /**
- * 返回集合大小
- * @return
- */
- int size() {
- return this.size;
- }
-
- /**
- * 判断集合是否为空
- * @return
- */
- boolean isEmpty() {
- return this.size == 0;
- }
-
- /**
- * 返回lexeme链的头部
- * @return
- */
- Cell getHead() {
- return this.head;
- }
-
- /**
- *
- * IK 中文分词 版本 5.0
- * IK Analyzer release 5.0
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * 源代码由林良益(linliangyi2005@gmail.com)提供
- * 版权声明 2012,乌龙茶工作室
- * provided by Linliangyi and copyright 2012 by Oolong studio
- *
- * QuickSortSet集合单元
- *
- */
- class Cell implements Comparable {
- private Cell prev;
- private Cell next;
- private Lexeme lexeme;
-
- Cell(Lexeme lexeme) {
- if (lexeme == null) {
- throw new IllegalArgumentException("lexeme must not be null");
- }
- this.lexeme = lexeme;
}
- public int compareTo(Cell o) {
- return this.lexeme.compareTo(o.lexeme);
+ /**
+ * 返回链表尾部元素
+ * @return
+ */
+ Lexeme peekLast() {
+ if (this.tail != null) {
+ return this.tail.lexeme;
+ }
+ return null;
}
- public Cell getPrev() {
- return this.prev;
+ /**
+ * 取出链表集合的最后一个元素
+ * @return Lexeme
+ */
+ Lexeme pollLast() {
+ if (this.size == 1) {
+ Lexeme last = this.head.lexeme;
+ this.head = null;
+ this.tail = null;
+ this.size--;
+ return last;
+
+ } else if (this.size > 1) {
+ Lexeme last = this.tail.lexeme;
+ this.tail = this.tail.prev;
+ this.size--;
+ return last;
+
+ } else {
+ return null;
+ }
}
- public Cell getNext() {
- return this.next;
+ /**
+ * 返回集合大小
+ * @return
+ */
+ int size() {
+ return this.size;
}
- public Lexeme getLexeme() {
- return this.lexeme;
+ /**
+ * 判断集合是否为空
+ * @return
+ */
+ boolean isEmpty() {
+ return this.size == 0;
+ }
+
+ /**
+ * 返回lexeme链的头部
+ * @return
+ */
+ Cell getHead() {
+ return this.head;
+ }
+
+ /**
+ *
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ * QuickSortSet集合单元
+ *
+ */
+ class Cell implements Comparable {
+ private Cell prev;
+ private Cell next;
+ private Lexeme lexeme;
+
+ Cell(Lexeme lexeme) {
+ if (lexeme == null) {
+ throw new IllegalArgumentException("lexeme must not be null");
+ }
+ this.lexeme = lexeme;
+ }
+
+ public int compareTo(Cell o) {
+ return this.lexeme.compareTo(o.lexeme);
+ }
+
+ public Cell getPrev() {
+ return this.prev;
+ }
+
+ public Cell getNext() {
+ return this.next;
+ }
+
+ public Lexeme getLexeme() {
+ return this.lexeme;
+ }
}
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/dic/DictSegment.java b/src/main/java/com/rymcu/forest/lucene/dic/DictSegment.java
index 6df8084..4ae7dd6 100644
--- a/src/main/java/com/rymcu/forest/lucene/dic/DictSegment.java
+++ b/src/main/java/com/rymcu/forest/lucene/dic/DictSegment.java
@@ -1,27 +1,25 @@
/**
- *
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
- *
+ *
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- *
+ *
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
- *
*/
package com.rymcu.forest.lucene.dic;
@@ -34,295 +32,295 @@ import java.util.Map;
*/
class DictSegment implements Comparable {
- // 公用字典表,存储汉字
- private static final Map charMap = new HashMap(16,
- 0.95f);
- // 数组大小上限
- private static final int ARRAY_LENGTH_LIMIT = 3;
+ // 公用字典表,存储汉字
+ private static final Map charMap = new HashMap(16,
+ 0.95f);
+ // 数组大小上限
+ private static final int ARRAY_LENGTH_LIMIT = 3;
- // Map存储结构
- private Map childrenMap;
- // 数组方式存储结构
- private DictSegment[] childrenArray;
+ // Map存储结构
+ private Map childrenMap;
+ // 数组方式存储结构
+ private DictSegment[] childrenArray;
- // 当前节点上存储的字符
- private Character nodeChar;
- // 当前节点存储的Segment数目
- // storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
- private int storeSize = 0;
- // 当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
- private int nodeState = 0;
+ // 当前节点上存储的字符
+ private Character nodeChar;
+ // 当前节点存储的Segment数目
+ // storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
+ private int storeSize = 0;
+ // 当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
+ private int nodeState = 0;
- DictSegment(Character nodeChar) {
- if (nodeChar == null) {
- throw new IllegalArgumentException("参数为空异常,字符不能为空");
- }
- this.nodeChar = nodeChar;
- }
-
- Character getNodeChar() {
- return nodeChar;
- }
-
- /*
- * 判断是否有下一个节点
- */
- boolean hasNextNode() {
- return this.storeSize > 0;
- }
-
- /**
- * 匹配词段
- * @param charArray
- * @return Hit
- */
- Hit match(char[] charArray) {
- return this.match(charArray, 0, charArray.length, null);
- }
-
- /**
- * 匹配词段
- * @param charArray
- * @param begin
- * @param length
- * @return Hit
- */
- Hit match(char[] charArray, int begin, int length) {
- return this.match(charArray, begin, length, null);
- }
-
- /**
- * 匹配词段
- * @param charArray
- * @param begin
- * @param length
- * @param searchHit
- * @return Hit
- */
- Hit match(char[] charArray, int begin, int length, Hit searchHit) {
-
- if (searchHit == null) {
- // 如果hit为空,新建
- searchHit = new Hit();
- // 设置hit的其实文本位置
- searchHit.setBegin(begin);
- } else {
- // 否则要将HIT状态重置
- searchHit.setUnmatch();
- }
- // 设置hit的当前处理位置
- searchHit.setEnd(begin);
-
- Character keyChar = new Character(charArray[begin]);
- DictSegment ds = null;
-
- // 引用实例变量为本地变量,避免查询时遇到更新的同步问题
- DictSegment[] segmentArray = this.childrenArray;
- Map segmentMap = this.childrenMap;
-
- // STEP1 在节点中查找keyChar对应的DictSegment
- if (segmentArray != null) {
- // 在数组中查找
- DictSegment keySegment = new DictSegment(keyChar);
- int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
- if (position >= 0) {
- ds = segmentArray[position];
- }
-
- } else if (segmentMap != null) {
- // 在map中查找
- ds = (DictSegment) segmentMap.get(keyChar);
- }
-
- // STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果
- if (ds != null) {
- if (length > 1) {
- // 词未匹配完,继续往下搜索
- return ds.match(charArray, begin + 1, length - 1, searchHit);
- } else if (length == 1) {
-
- // 搜索最后一个char
- if (ds.nodeState == 1) {
- // 添加HIT状态为完全匹配
- searchHit.setMatch();
+ DictSegment(Character nodeChar) {
+ if (nodeChar == null) {
+ throw new IllegalArgumentException("参数为空异常,字符不能为空");
}
- if (ds.hasNextNode()) {
- // 添加HIT状态为前缀匹配
- searchHit.setPrefix();
- // 记录当前位置的DictSegment
- searchHit.setMatchedDictSegment(ds);
+ this.nodeChar = nodeChar;
+ }
+
+ Character getNodeChar() {
+ return nodeChar;
+ }
+
+ /*
+ * 判断是否有下一个节点
+ */
+ boolean hasNextNode() {
+ return this.storeSize > 0;
+ }
+
+ /**
+ * 匹配词段
+ * @param charArray
+ * @return Hit
+ */
+ Hit match(char[] charArray) {
+ return this.match(charArray, 0, charArray.length, null);
+ }
+
+ /**
+ * 匹配词段
+ * @param charArray
+ * @param begin
+ * @param length
+ * @return Hit
+ */
+ Hit match(char[] charArray, int begin, int length) {
+ return this.match(charArray, begin, length, null);
+ }
+
+ /**
+ * 匹配词段
+ * @param charArray
+ * @param begin
+ * @param length
+ * @param searchHit
+ * @return Hit
+ */
+ Hit match(char[] charArray, int begin, int length, Hit searchHit) {
+
+ if (searchHit == null) {
+ // 如果hit为空,新建
+ searchHit = new Hit();
+            // 设置hit的起始文本位置
+ searchHit.setBegin(begin);
+ } else {
+ // 否则要将HIT状态重置
+ searchHit.setUnmatch();
}
+ // 设置hit的当前处理位置
+ searchHit.setEnd(begin);
+
+ Character keyChar = new Character(charArray[begin]);
+ DictSegment ds = null;
+
+ // 引用实例变量为本地变量,避免查询时遇到更新的同步问题
+ DictSegment[] segmentArray = this.childrenArray;
+ Map segmentMap = this.childrenMap;
+
+ // STEP1 在节点中查找keyChar对应的DictSegment
+ if (segmentArray != null) {
+ // 在数组中查找
+ DictSegment keySegment = new DictSegment(keyChar);
+ int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
+ if (position >= 0) {
+ ds = segmentArray[position];
+ }
+
+ } else if (segmentMap != null) {
+ // 在map中查找
+ ds = (DictSegment) segmentMap.get(keyChar);
+ }
+
+ // STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果
+ if (ds != null) {
+ if (length > 1) {
+ // 词未匹配完,继续往下搜索
+ return ds.match(charArray, begin + 1, length - 1, searchHit);
+ } else if (length == 1) {
+
+ // 搜索最后一个char
+ if (ds.nodeState == 1) {
+ // 添加HIT状态为完全匹配
+ searchHit.setMatch();
+ }
+ if (ds.hasNextNode()) {
+ // 添加HIT状态为前缀匹配
+ searchHit.setPrefix();
+ // 记录当前位置的DictSegment
+ searchHit.setMatchedDictSegment(ds);
+ }
+ return searchHit;
+ }
+
+ }
+ // STEP3 没有找到DictSegment, 将HIT设置为不匹配
return searchHit;
- }
-
- }
- // STEP3 没有找到DictSegment, 将HIT设置为不匹配
- return searchHit;
- }
-
- /**
- * 加载填充词典片段
- * @param charArray
- */
- void fillSegment(char[] charArray) {
- this.fillSegment(charArray, 0, charArray.length, 1);
- }
-
- /**
- * 屏蔽词典中的一个词
- * @param charArray
- */
- void disableSegment(char[] charArray) {
- this.fillSegment(charArray, 0, charArray.length, 0);
- }
-
- /**
- * 加载填充词典片段
- * @param charArray
- * @param begin
- * @param length
- * @param enabled
- */
- private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
- // 获取字典表中的汉字对象
- Character beginChar = new Character(charArray[begin]);
- Character keyChar = charMap.get(beginChar);
- // 字典中没有该字,则将其添加入字典
- if (keyChar == null) {
- charMap.put(beginChar, beginChar);
- keyChar = beginChar;
}
- // 搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建
- DictSegment ds = lookforSegment(keyChar, enabled);
- if (ds != null) {
- // 处理keyChar对应的segment
- if (length > 1) {
- // 词元还没有完全加入词典树
- ds.fillSegment(charArray, begin + 1, length - 1, enabled);
- } else if (length == 1) {
- // 已经是词元的最后一个char,设置当前节点状态为enabled,
- // enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词
- ds.nodeState = enabled;
- }
+ /**
+ * 加载填充词典片段
+ * @param charArray
+ */
+ void fillSegment(char[] charArray) {
+ this.fillSegment(charArray, 0, charArray.length, 1);
}
- }
+ /**
+ * 屏蔽词典中的一个词
+ * @param charArray
+ */
+ void disableSegment(char[] charArray) {
+ this.fillSegment(charArray, 0, charArray.length, 0);
+ }
- /**
- * 查找本节点下对应的keyChar的segment *
- * @param keyChar
- * @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null
- * @return
- */
- private DictSegment lookforSegment(Character keyChar, int create) {
+ /**
+ * 加载填充词典片段
+ * @param charArray
+ * @param begin
+ * @param length
+ * @param enabled
+ */
+ private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
+ // 获取字典表中的汉字对象
+ Character beginChar = new Character(charArray[begin]);
+ Character keyChar = charMap.get(beginChar);
+ // 字典中没有该字,则将其添加入字典
+ if (keyChar == null) {
+ charMap.put(beginChar, beginChar);
+ keyChar = beginChar;
+ }
- DictSegment ds = null;
+        // 搜索当前节点的存储,查询对应keyChar的segment,如果没有则创建
+ DictSegment ds = lookforSegment(keyChar, enabled);
+ if (ds != null) {
+ // 处理keyChar对应的segment
+ if (length > 1) {
+ // 词元还没有完全加入词典树
+ ds.fillSegment(charArray, begin + 1, length - 1, enabled);
+ } else if (length == 1) {
+ // 已经是词元的最后一个char,设置当前节点状态为enabled,
+ // enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词
+ ds.nodeState = enabled;
+ }
+ }
- if (this.storeSize <= ARRAY_LENGTH_LIMIT) {
- // 获取数组容器,如果数组未创建则创建数组
- DictSegment[] segmentArray = getChildrenArray();
- // 搜寻数组
- DictSegment keySegment = new DictSegment(keyChar);
- int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
- if (position >= 0) {
- ds = segmentArray[position];
- }
+ }
- // 遍历数组后没有找到对应的segment
- if (ds == null && create == 1) {
- ds = keySegment;
- if (this.storeSize < ARRAY_LENGTH_LIMIT) {
- // 数组容量未满,使用数组存储
- segmentArray[this.storeSize] = ds;
- // segment数目+1
- this.storeSize++;
- Arrays.sort(segmentArray, 0, this.storeSize);
+ /**
+     * 查找本节点下对应的keyChar的segment
+ * @param keyChar
+ * @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null
+ * @return
+ */
+ private DictSegment lookforSegment(Character keyChar, int create) {
+
+ DictSegment ds = null;
+
+ if (this.storeSize <= ARRAY_LENGTH_LIMIT) {
+ // 获取数组容器,如果数组未创建则创建数组
+ DictSegment[] segmentArray = getChildrenArray();
+ // 搜寻数组
+ DictSegment keySegment = new DictSegment(keyChar);
+ int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
+ if (position >= 0) {
+ ds = segmentArray[position];
+ }
+
+ // 遍历数组后没有找到对应的segment
+ if (ds == null && create == 1) {
+ ds = keySegment;
+ if (this.storeSize < ARRAY_LENGTH_LIMIT) {
+ // 数组容量未满,使用数组存储
+ segmentArray[this.storeSize] = ds;
+ // segment数目+1
+ this.storeSize++;
+ Arrays.sort(segmentArray, 0, this.storeSize);
+
+ } else {
+ // 数组容量已满,切换Map存储
+ // 获取Map容器,如果Map未创建,则创建Map
+ Map segmentMap = getChildrenMap();
+ // 将数组中的segment迁移到Map中
+ migrate(segmentArray, segmentMap);
+ // 存储新的segment
+ segmentMap.put(keyChar, ds);
+ // segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组
+ this.storeSize++;
+ // 释放当前的数组引用
+ this.childrenArray = null;
+ }
+
+ }
} else {
- // 数组容量已满,切换Map存储
- // 获取Map容器,如果Map未创建,则创建Map
- Map segmentMap = getChildrenMap();
- // 将数组中的segment迁移到Map中
- migrate(segmentArray, segmentMap);
- // 存储新的segment
- segmentMap.put(keyChar, ds);
- // segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组
- this.storeSize++;
- // 释放当前的数组引用
- this.childrenArray = null;
+ // 获取Map容器,如果Map未创建,则创建Map
+ Map segmentMap = getChildrenMap();
+ // 搜索Map
+ ds = (DictSegment) segmentMap.get(keyChar);
+ if (ds == null && create == 1) {
+ // 构造新的segment
+ ds = new DictSegment(keyChar);
+ segmentMap.put(keyChar, ds);
+ // 当前节点存储segment数目+1
+ this.storeSize++;
+ }
}
- }
-
- } else {
- // 获取Map容器,如果Map未创建,则创建Map
- Map segmentMap = getChildrenMap();
- // 搜索Map
- ds = (DictSegment) segmentMap.get(keyChar);
- if (ds == null && create == 1) {
- // 构造新的segment
- ds = new DictSegment(keyChar);
- segmentMap.put(keyChar, ds);
- // 当前节点存储segment数目+1
- this.storeSize++;
- }
+ return ds;
}
- return ds;
- }
-
- /**
- * 获取数组容器
- * 线程同步方法
- */
- private DictSegment[] getChildrenArray() {
- if (this.childrenArray == null) {
- synchronized (this) {
+ /**
+ * 获取数组容器
+ * 线程同步方法
+ */
+ private DictSegment[] getChildrenArray() {
if (this.childrenArray == null) {
- this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
+ synchronized (this) {
+ if (this.childrenArray == null) {
+ this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
+ }
+ }
}
- }
+ return this.childrenArray;
}
- return this.childrenArray;
- }
- /**
- * 获取Map容器
- * 线程同步方法
- */
- private Map getChildrenMap() {
- if (this.childrenMap == null) {
- synchronized (this) {
+ /**
+ * 获取Map容器
+ * 线程同步方法
+ */
+ private Map getChildrenMap() {
if (this.childrenMap == null) {
- this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2, 0.8f);
+ synchronized (this) {
+ if (this.childrenMap == null) {
+ this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2, 0.8f);
+ }
+ }
}
- }
+ return this.childrenMap;
}
- return this.childrenMap;
- }
- /**
- * 将数组中的segment迁移到Map中
- * @param segmentArray
- */
- private void migrate(DictSegment[] segmentArray, Map segmentMap) {
- for (DictSegment segment : segmentArray) {
- if (segment != null) {
- segmentMap.put(segment.nodeChar, segment);
- }
+ /**
+ * 将数组中的segment迁移到Map中
+ * @param segmentArray
+ */
+ private void migrate(DictSegment[] segmentArray, Map segmentMap) {
+ for (DictSegment segment : segmentArray) {
+ if (segment != null) {
+ segmentMap.put(segment.nodeChar, segment);
+ }
+ }
}
- }
- /**
- * 实现Comparable接口
- * @param o
- * @return int
- */
- @Override
- public int compareTo(DictSegment o) {
- // 对当前节点存储的char进行比较
- return this.nodeChar.compareTo(o.nodeChar);
- }
+ /**
+ * 实现Comparable接口
+ * @param o
+ * @return int
+ */
+ @Override
+ public int compareTo(DictSegment o) {
+ // 对当前节点存储的char进行比较
+ return this.nodeChar.compareTo(o.nodeChar);
+ }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java b/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java
index a40ffa3..51fa7b2 100644
--- a/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java
+++ b/src/main/java/com/rymcu/forest/lucene/dic/Dictionary.java
@@ -28,320 +28,344 @@ import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;
-/** 词典管理类,单例模式 */
+/**
+ * 词典管理类,单例模式
+ */
public class Dictionary {
- /** 词典单例 */
- private static Dictionary singleton;
- /** 主词典对象 */
- private DictSegment _MainDict;
- /** 停止词词典 */
- private DictSegment _StopWordDict;
- /** 量词词典 */
- private DictSegment _QuantifierDict;
- /** 用户自定义词典路径 */
- private static final String PATH_USER_DIC =
- System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";
- /** 配置对象 */
- private final Configuration cfg;
+ /**
+ * 词典单例
+ */
+ private static Dictionary singleton;
+ /**
+ * 主词典对象
+ */
+ private DictSegment _MainDict;
+ /**
+ * 停止词词典
+ */
+ private DictSegment _StopWordDict;
+ /**
+ * 量词词典
+ */
+ private DictSegment _QuantifierDict;
+ /**
+ * 用户自定义词典路径
+ */
+ private static final String PATH_USER_DIC =
+ System.getProperty("user.dir") + "/lucene/userDic/userDic.dic";
+ /**
+ * 配置对象
+ */
+ private final Configuration cfg;
- private Dictionary(Configuration cfg) {
- this.cfg = cfg;
- this.loadMainDict();
- this.loadStopWordDict();
- this.loadQuantifierDict();
- }
+ private Dictionary(Configuration cfg) {
+ this.cfg = cfg;
+ this.loadMainDict();
+ this.loadStopWordDict();
+ this.loadQuantifierDict();
+ }
- /**
- * 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间
- * 该方法提供了一个在应用加载阶段就初始化字典的手段
- */
- public static void initial(Configuration cfg) {
- if (singleton == null) {
- synchronized (Dictionary.class) {
+ /**
+ * 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间
+ * 该方法提供了一个在应用加载阶段就初始化字典的手段
+ */
+ public static void initial(Configuration cfg) {
if (singleton == null) {
- singleton = new Dictionary(cfg);
+ synchronized (Dictionary.class) {
+ if (singleton == null) {
+ singleton = new Dictionary(cfg);
+ }
+ }
}
- }
}
- }
- /**
- * 获取词典单子实例
- *
- * @return Dictionary 单例对象
- */
- public static Dictionary getSingleton() {
- if (singleton == null) {
- throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
- }
- return singleton;
- }
-
- /**
- * 批量加载新词条
- *
- * @param words Collection词条列表
- */
- public void addWords(Collection words) {
- if (words != null) {
- for (String word : words) {
- if (word != null) {
- // 批量加载词条到主内存词典中
- singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
+ /**
+     * 获取词典单例实例
+ *
+ * @return Dictionary 单例对象
+ */
+ public static Dictionary getSingleton() {
+ if (singleton == null) {
+ throw new IllegalStateException("词典尚未初始化,请先调用initial方法");
}
- }
+ return singleton;
}
- }
- /**
- * 批量移除(屏蔽)词条
- *
- * @param words
- */
- public void disableWords(Collection words) {
- if (words != null) {
- for (String word : words) {
- if (word != null) {
- // 批量屏蔽词条
- singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
+ /**
+ * 批量加载新词条
+ *
+ * @param words Collection词条列表
+ */
+ public void addWords(Collection words) {
+ if (words != null) {
+ for (String word : words) {
+ if (word != null) {
+ // 批量加载词条到主内存词典中
+ singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
+ }
+ }
}
- }
}
- }
- /**
- * 检索匹配主词典
- *
- * @param charArray
- * @return Hit 匹配结果描述
- */
- public Hit matchInMainDict(char[] charArray) {
- return singleton._MainDict.match(charArray);
- }
-
- /**
- * 检索匹配主词典
- *
- * @param charArray
- * @param begin
- * @param length
- * @return Hit 匹配结果描述
- */
- public Hit matchInMainDict(char[] charArray, int begin, int length) {
- return singleton._MainDict.match(charArray, begin, length);
- }
-
- /**
- * 检索匹配量词词典
- *
- * @param charArray
- * @param begin
- * @param length
- * @return Hit 匹配结果描述
- */
- public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
- return singleton._QuantifierDict.match(charArray, begin, length);
- }
-
- /**
- * 从已匹配的Hit中直接取出DictSegment,继续向下匹配
- *
- * @param charArray
- * @param currentIndex
- * @param matchedHit
- * @return Hit
- */
- public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
- DictSegment ds = matchedHit.getMatchedDictSegment();
- return ds.match(charArray, currentIndex, 1, matchedHit);
- }
-
- /**
- * 判断是否是停止词
- *
- * @param charArray
- * @param begin
- * @param length
- * @return boolean
- */
- public boolean isStopWord(char[] charArray, int begin, int length) {
- return singleton._StopWordDict.match(charArray, begin, length).isMatch();
- }
-
- /** 加载主词典及扩展词典 */
- private void loadMainDict() {
- // 建立一个主词典实例
- _MainDict = new DictSegment((char) 0);
- // 读取主词典文件
- Resource resource = new ClassPathResource(cfg.getMainDictionary());
- try {
- InputStream is = resource.getInputStream();
- BufferedReader br =
- new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
- _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+ /**
+ * 批量移除(屏蔽)词条
+ *
+ * @param words
+ */
+ public void disableWords(Collection words) {
+ if (words != null) {
+ for (String word : words) {
+ if (word != null) {
+ // 批量屏蔽词条
+ singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
+ }
+ }
}
- } while (theWord != null);
- } catch (IOException e) {
- System.err.println("Main Dictionary loading exception.");
- e.printStackTrace();
}
- // 加载扩展词典
- this.loadExtDict();
- }
- /** 加载用户配置的扩展词典到主词库表 */
- private void loadExtDict() {
- // 加载扩展词典配置
- List extDictFiles = cfg.getExtDictionary();
- if (extDictFiles != null) {
- InputStream is;
- for (String extDictName : extDictFiles) {
- // 读取扩展词典文件
- System.out.println("加载扩展词典:" + extDictName);
- is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
- // 如果找不到扩展的字典,则忽略
+ /**
+ * 检索匹配主词典
+ *
+ * @param charArray
+ * @return Hit 匹配结果描述
+ */
+ public Hit matchInMainDict(char[] charArray) {
+ return singleton._MainDict.match(charArray);
+ }
+
+ /**
+ * 检索匹配主词典
+ *
+ * @param charArray
+ * @param begin
+ * @param length
+ * @return Hit 匹配结果描述
+ */
+ public Hit matchInMainDict(char[] charArray, int begin, int length) {
+ return singleton._MainDict.match(charArray, begin, length);
+ }
+
+ /**
+ * 检索匹配量词词典
+ *
+ * @param charArray
+ * @param begin
+ * @param length
+ * @return Hit 匹配结果描述
+ */
+ public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
+ return singleton._QuantifierDict.match(charArray, begin, length);
+ }
+
+ /**
+ * 从已匹配的Hit中直接取出DictSegment,继续向下匹配
+ *
+ * @param charArray
+ * @param currentIndex
+ * @param matchedHit
+ * @return Hit
+ */
+ public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
+ DictSegment ds = matchedHit.getMatchedDictSegment();
+ return ds.match(charArray, currentIndex, 1, matchedHit);
+ }
+
+ /**
+ * 判断是否是停止词
+ *
+ * @param charArray
+ * @param begin
+ * @param length
+ * @return boolean
+ */
+ public boolean isStopWord(char[] charArray, int begin, int length) {
+ return singleton._StopWordDict.match(charArray, begin, length).isMatch();
+ }
+
+ /**
+ * 加载主词典及扩展词典
+ */
+ private void loadMainDict() {
+ // 建立一个主词典实例
+ _MainDict = new DictSegment((char) 0);
+ // 读取主词典文件
+ Resource resource = new ClassPathResource(cfg.getMainDictionary());
+ try {
+ InputStream is = resource.getInputStream();
+ BufferedReader br =
+ new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
+ String theWord;
+ do {
+ theWord = br.readLine();
+ if (theWord != null && !"".equals(theWord.trim())) {
+ _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+ }
+ } while (theWord != null);
+ } catch (IOException e) {
+ System.err.println("Main Dictionary loading exception.");
+ e.printStackTrace();
+ }
+ // 加载扩展词典
+ this.loadExtDict();
+ }
+
+ /**
+ * 加载用户配置的扩展词典到主词库表
+ */
+ private void loadExtDict() {
+ // 加载扩展词典配置
+ List extDictFiles = cfg.getExtDictionary();
+ if (extDictFiles != null) {
+ InputStream is;
+ for (String extDictName : extDictFiles) {
+ // 读取扩展词典文件
+ System.out.println("加载扩展词典:" + extDictName);
+ is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
+ // 如果找不到扩展的字典,则忽略
+ if (is == null) {
+ try {
+ is = new FileInputStream(extDictName);
+ } catch (FileNotFoundException e) {
+ continue;
+ }
+ }
+ try {
+ BufferedReader br =
+ new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
+ String theWord;
+ do {
+ theWord = br.readLine();
+ if (theWord != null && !"".equals(theWord.trim())) {
+ // 加载扩展词典数据到主内存词典中
+ System.out.println(theWord);
+ _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+ }
+ } while (theWord != null);
+ } catch (IOException ioe) {
+ System.err.println("Extension Dictionary loading exception.");
+ ioe.printStackTrace();
+ } finally {
+ try {
+ is.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * 加载用户扩展的停止词词典
+ */
+ private void loadStopWordDict() {
+ // 建立一个主词典实例
+ _StopWordDict = new DictSegment((char) 0);
+ // 加载扩展停止词典
+ List extStopWordDictFiles = cfg.getExtStopWordDictionary();
+ if (extStopWordDictFiles != null) {
+ InputStream is = null;
+ for (String extStopWordDictName : extStopWordDictFiles) {
+ System.out.println("加载扩展停止词典:" + extStopWordDictName);
+ // 读取扩展词典文件
+ is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
+ // 如果找不到扩展的字典,则忽略
+ if (is == null) {
+ continue;
+ }
+ try {
+ BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
+ String theWord;
+ do {
+ theWord = br.readLine();
+ if (theWord != null && !"".equals(theWord.trim())) {
+ // 加载扩展停止词典数据到内存中
+ _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+ }
+ } while (theWord != null);
+ } catch (IOException ioe) {
+ System.err.println("Extension Stop word Dictionary loading exception.");
+ ioe.printStackTrace();
+ } finally {
+ try {
+ is.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * 加载量词词典
+ */
+ private void loadQuantifierDict() {
+ // 建立一个量词典实例
+ _QuantifierDict = new DictSegment((char) 0);
+ // 读取量词词典文件
+ InputStream is =
+ this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDictionary());
if (is == null) {
- try {
- is = new FileInputStream(extDictName);
- } catch (FileNotFoundException e) {
- continue;
- }
+ throw new RuntimeException("Quantifier Dictionary not found!!!");
}
try {
- BufferedReader br =
- new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
- // 加载扩展词典数据到主内存词典中
- System.out.println(theWord);
- _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
- }
- } while (theWord != null);
+ BufferedReader br =
+ new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
+ String theWord;
+ do {
+ theWord = br.readLine();
+ if (theWord != null && !"".equals(theWord.trim())) {
+ _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+ }
+ } while (theWord != null);
} catch (IOException ioe) {
- System.err.println("Extension Dictionary loading exception.");
- ioe.printStackTrace();
+ System.err.println("Quantifier Dictionary loading exception.");
+ ioe.printStackTrace();
} finally {
- try {
- is.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
+ try {
+ is.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
}
- }
}
- }
- /** 加载用户扩展的停止词词典 */
- private void loadStopWordDict() {
- // 建立一个主词典实例
- _StopWordDict = new DictSegment((char) 0);
- // 加载扩展停止词典
- List extStopWordDictFiles = cfg.getExtStopWordDictionary();
- if (extStopWordDictFiles != null) {
- InputStream is = null;
- for (String extStopWordDictName : extStopWordDictFiles) {
- System.out.println("加载扩展停止词典:" + extStopWordDictName);
+ /**
+ * 加载用户配置的自定义扩展词典到主词库表
+ */
+ public void updateUserDict() {
+ // 加载扩展词典配置
+ InputStream is;
// 读取扩展词典文件
- is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
- // 如果找不到扩展的字典,则忽略
- if (is == null) {
- continue;
+ System.out.println("更新加载扩展词典:" + PATH_USER_DIC);
+ try {
+ is = new FileInputStream(PATH_USER_DIC);
+ } catch (FileNotFoundException e) {
+ return;
}
try {
- BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
- // 加载扩展停止词典数据到内存中
- _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
- }
- } while (theWord != null);
+ BufferedReader br =
+ new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
+ String theWord;
+ do {
+ theWord = br.readLine();
+ if (theWord != null && !"".equals(theWord.trim())) {
+ // 加载扩展词典数据到主内存词典中
+ System.out.println(theWord);
+ _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
+ }
+ } while (theWord != null);
} catch (IOException ioe) {
- System.err.println("Extension Stop word Dictionary loading exception.");
- ioe.printStackTrace();
+ System.err.println("Extension Dictionary loading exception.");
+ ioe.printStackTrace();
} finally {
- try {
- is.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
+ try {
+ is.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
}
- }
}
- }
-
- /** 加载量词词典 */
- private void loadQuantifierDict() {
- // 建立一个量词典实例
- _QuantifierDict = new DictSegment((char) 0);
- // 读取量词词典文件
- InputStream is =
- this.getClass().getClassLoader().getResourceAsStream(cfg.getQuantifierDictionary());
- if (is == null) {
- throw new RuntimeException("Quantifier Dictionary not found!!!");
- }
- try {
- BufferedReader br =
- new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
- _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
- }
- } while (theWord != null);
- } catch (IOException ioe) {
- System.err.println("Quantifier Dictionary loading exception.");
- ioe.printStackTrace();
- } finally {
- try {
- is.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
-
- /** 加载用户配置的自定义扩展词典到主词库表 */
- public void updateUserDict() {
- // 加载扩展词典配置
- InputStream is;
- // 读取扩展词典文件
- System.out.println("更新加载扩展词典:" + PATH_USER_DIC);
- try {
- is = new FileInputStream(PATH_USER_DIC);
- } catch (FileNotFoundException e) {
- return;
- }
- try {
- BufferedReader br =
- new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8), 512);
- String theWord;
- do {
- theWord = br.readLine();
- if (theWord != null && !"".equals(theWord.trim())) {
- // 加载扩展词典数据到主内存词典中
- System.out.println(theWord);
- _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
- }
- } while (theWord != null);
- } catch (IOException ioe) {
- System.err.println("Extension Dictionary loading exception.");
- ioe.printStackTrace();
- } finally {
- try {
- is.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/dic/Hit.java b/src/main/java/com/rymcu/forest/lucene/dic/Hit.java
index f99769c..23e4abc 100644
--- a/src/main/java/com/rymcu/forest/lucene/dic/Hit.java
+++ b/src/main/java/com/rymcu/forest/lucene/dic/Hit.java
@@ -1,27 +1,25 @@
/**
- *
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
- *
+ *
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- *
+ *
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
- *
*/
package com.rymcu.forest.lucene.dic;
@@ -29,91 +27,91 @@ package com.rymcu.forest.lucene.dic;
* 表示一次词典匹配的命中
*/
public class Hit {
- // Hit不匹配
- private static final int UNMATCH = 0x00000000;
- // Hit完全匹配
- private static final int MATCH = 0x00000001;
- // Hit前缀匹配
- private static final int PREFIX = 0x00000010;
+ // Hit不匹配
+ private static final int UNMATCH = 0x00000000;
+ // Hit完全匹配
+ private static final int MATCH = 0x00000001;
+ // Hit前缀匹配
+ private static final int PREFIX = 0x00000010;
- // 该HIT当前状态,默认未匹配
- private int hitState = UNMATCH;
+ // 该HIT当前状态,默认未匹配
+ private int hitState = UNMATCH;
- // 记录词典匹配过程中,当前匹配到的词典分支节点
- private DictSegment matchedDictSegment;
- /*
- * 词段开始位置
- */
- private int begin;
- /*
- * 词段的结束位置
- */
- private int end;
+ // 记录词典匹配过程中,当前匹配到的词典分支节点
+ private DictSegment matchedDictSegment;
+ /*
+ * 词段开始位置
+ */
+ private int begin;
+ /*
+ * 词段的结束位置
+ */
+ private int end;
- /**
- * 判断是否完全匹配
- */
- public boolean isMatch() {
- return (this.hitState & MATCH) > 0;
- }
+ /**
+ * 判断是否完全匹配
+ */
+ public boolean isMatch() {
+ return (this.hitState & MATCH) > 0;
+ }
- /**
- *
- */
- public void setMatch() {
- this.hitState = this.hitState | MATCH;
- }
+ /**
+ *
+ */
+ public void setMatch() {
+ this.hitState = this.hitState | MATCH;
+ }
- /**
- * 判断是否是词的前缀
- */
- public boolean isPrefix() {
- return (this.hitState & PREFIX) > 0;
- }
+ /**
+ * 判断是否是词的前缀
+ */
+ public boolean isPrefix() {
+ return (this.hitState & PREFIX) > 0;
+ }
- /**
- *
- */
- public void setPrefix() {
- this.hitState = this.hitState | PREFIX;
- }
+ /**
+ *
+ */
+ public void setPrefix() {
+ this.hitState = this.hitState | PREFIX;
+ }
- /**
- * 判断是否是不匹配
- */
- public boolean isUnmatch() {
- return this.hitState == UNMATCH;
- }
+ /**
+ * 判断是否是不匹配
+ */
+ public boolean isUnmatch() {
+ return this.hitState == UNMATCH;
+ }
- /**
- *
- */
- public void setUnmatch() {
- this.hitState = UNMATCH;
- }
+ /**
+ *
+ */
+ public void setUnmatch() {
+ this.hitState = UNMATCH;
+ }
- public DictSegment getMatchedDictSegment() {
- return matchedDictSegment;
- }
+ public DictSegment getMatchedDictSegment() {
+ return matchedDictSegment;
+ }
- public void setMatchedDictSegment(DictSegment matchedDictSegment) {
- this.matchedDictSegment = matchedDictSegment;
- }
+ public void setMatchedDictSegment(DictSegment matchedDictSegment) {
+ this.matchedDictSegment = matchedDictSegment;
+ }
- public int getBegin() {
- return begin;
- }
+ public int getBegin() {
+ return begin;
+ }
- public void setBegin(int begin) {
- this.begin = begin;
- }
+ public void setBegin(int begin) {
+ this.begin = begin;
+ }
- public int getEnd() {
- return end;
- }
+ public int getEnd() {
+ return end;
+ }
- public void setEnd(int end) {
- this.end = end;
- }
+ public void setEnd(int end) {
+ this.end = end;
+ }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/lucene/ArticleBeanIndex.java b/src/main/java/com/rymcu/forest/lucene/lucene/ArticleBeanIndex.java
index 9de61c1..65d072b 100644
--- a/src/main/java/com/rymcu/forest/lucene/lucene/ArticleBeanIndex.java
+++ b/src/main/java/com/rymcu/forest/lucene/lucene/ArticleBeanIndex.java
@@ -19,29 +19,29 @@ import java.util.concurrent.CountDownLatch;
*/
public class ArticleBeanIndex extends BaseIndex {
- public ArticleBeanIndex(
- String parentIndexPath,
- int subIndex,
- CountDownLatch countDownLatch1,
- CountDownLatch countDownLatch2,
- List list) {
- super(parentIndexPath, subIndex, countDownLatch1, countDownLatch2, list);
- }
-
- @Override
- public void indexDoc(IndexWriter writer, ArticleLucene t) throws Exception {
- Document doc = new Document();
- Field id = new Field("id", t.getIdArticle() + "", TextField.TYPE_STORED);
- Field title = new Field("title", t.getArticleTitle(), TextField.TYPE_STORED);
- Field summary = new Field("summary", t.getArticleContent(), TextField.TYPE_STORED);
- // 添加到Document中
- doc.add(id);
- doc.add(title);
- doc.add(summary);
- if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
- writer.addDocument(doc);
- } else {
- writer.updateDocument(new Term("id", t.getIdArticle() + ""), doc);
+ public ArticleBeanIndex(
+ String parentIndexPath,
+ int subIndex,
+ CountDownLatch countDownLatch1,
+ CountDownLatch countDownLatch2,
+ List list) {
+ super(parentIndexPath, subIndex, countDownLatch1, countDownLatch2, list);
+ }
+
+ @Override
+ public void indexDoc(IndexWriter writer, ArticleLucene t) throws Exception {
+ Document doc = new Document();
+ Field id = new Field("id", t.getIdArticle() + "", TextField.TYPE_STORED);
+ Field title = new Field("title", t.getArticleTitle(), TextField.TYPE_STORED);
+ Field summary = new Field("summary", t.getArticleContent(), TextField.TYPE_STORED);
+ // 添加到Document中
+ doc.add(id);
+ doc.add(title);
+ doc.add(summary);
+ if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
+ writer.addDocument(doc);
+ } else {
+ writer.updateDocument(new Term("id", t.getIdArticle() + ""), doc);
+ }
}
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/lucene/BaseIndex.java b/src/main/java/com/rymcu/forest/lucene/lucene/BaseIndex.java
index ecc8911..9d57cca 100644
--- a/src/main/java/com/rymcu/forest/lucene/lucene/BaseIndex.java
+++ b/src/main/java/com/rymcu/forest/lucene/lucene/BaseIndex.java
@@ -16,125 +16,136 @@ import java.util.concurrent.CountDownLatch;
* @date 2021/2/2 14:14
*/
public abstract class BaseIndex implements Runnable {
- /** 父级索引路径 */
- private String parentIndexPath;
- /** 索引编写器 */
- private IndexWriter writer;
+ /**
+ * 父级索引路径
+ */
+ private String parentIndexPath;
+ /**
+ * 索引编写器
+ */
+ private IndexWriter writer;
- private int subIndex;
- /** 主线程 */
- private final CountDownLatch countDownLatch1;
- /** 工作线程 */
- private final CountDownLatch countDownLatch2;
- /** 对象列表 */
- private List list;
+ private int subIndex;
+ /**
+ * 主线程
+ */
+ private final CountDownLatch countDownLatch1;
+ /**
+ * 工作线程
+ */
+ private final CountDownLatch countDownLatch2;
+ /**
+ * 对象列表
+ */
+ private List list;
- public BaseIndex(String parentIndexPath, int subIndex) {
- this.parentIndexPath = parentIndexPath;
- this.subIndex = subIndex;
- try {
- this.writer = IndexUtil.getIndexWriter(parentIndexPath + "/index" + subIndex, true);
- } catch (IOException e) {
- e.printStackTrace();
- }
- this.countDownLatch1 = null;
- this.countDownLatch2 = null;
- }
-
- public BaseIndex(
- IndexWriter writer,
- CountDownLatch countDownLatch1,
- CountDownLatch countDownLatch2,
- List list) {
- super();
- this.writer = writer;
- this.countDownLatch1 = countDownLatch1;
- this.countDownLatch2 = countDownLatch2;
- this.list = list;
- }
-
- public BaseIndex(
- String parentIndexPath,
- int subIndex,
- CountDownLatch countDownLatch1,
- CountDownLatch countDownLatch2,
- List list) {
- super();
- this.parentIndexPath = parentIndexPath;
- this.subIndex = subIndex;
- try {
- // 多目录索引创建
- File file = new File(parentIndexPath + "/index" + subIndex);
- if (!file.exists()) {
- file.mkdir();
- }
- this.writer = IndexUtil.getIndexWriter(parentIndexPath + "/index" + subIndex, true);
- } catch (IOException e) {
- e.printStackTrace();
+ public BaseIndex(String parentIndexPath, int subIndex) {
+ this.parentIndexPath = parentIndexPath;
+ this.subIndex = subIndex;
+ try {
+ this.writer = IndexUtil.getIndexWriter(parentIndexPath + "/index" + subIndex, true);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ this.countDownLatch1 = null;
+ this.countDownLatch2 = null;
}
- this.subIndex = subIndex;
- this.countDownLatch1 = countDownLatch1;
- this.countDownLatch2 = countDownLatch2;
- this.list = list;
- }
-
- public BaseIndex(
- String path, CountDownLatch countDownLatch1, CountDownLatch countDownLatch2, List list) {
- super();
- try {
- // 单目录索引创建
- File file = new File(path);
- if (!file.exists()) {
- file.mkdir();
- }
- this.writer = IndexUtil.getIndexWriter(path, true);
- } catch (IOException e) {
- e.printStackTrace();
+ public BaseIndex(
+ IndexWriter writer,
+ CountDownLatch countDownLatch1,
+ CountDownLatch countDownLatch2,
+ List list) {
+ super();
+ this.writer = writer;
+ this.countDownLatch1 = countDownLatch1;
+ this.countDownLatch2 = countDownLatch2;
+ this.list = list;
}
- ;
- this.countDownLatch1 = countDownLatch1;
- this.countDownLatch2 = countDownLatch2;
- this.list = list;
- }
- /**
- * 创建索引
- *
- * @param writer
- * @throws IOException
- * @throws ParseException
- */
- public abstract void indexDoc(IndexWriter writer, T t) throws Exception;
- /**
- * 批量索引创建
- *
- * @param writer
- * @param t
- * @throws Exception
- */
- public void indexDocs(IndexWriter writer, List t) throws Exception {
- for (T t2 : t) {
- indexDoc(writer, t2);
- }
- }
+ public BaseIndex(
+ String parentIndexPath,
+ int subIndex,
+ CountDownLatch countDownLatch1,
+ CountDownLatch countDownLatch2,
+ List list) {
+ super();
+ this.parentIndexPath = parentIndexPath;
+ this.subIndex = subIndex;
+ try {
+ // 多目录索引创建
+ File file = new File(parentIndexPath + "/index" + subIndex);
+ if (!file.exists()) {
+ file.mkdir();
+ }
+ this.writer = IndexUtil.getIndexWriter(parentIndexPath + "/index" + subIndex, true);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
- @Override
- public void run() {
- try {
- countDownLatch1.await();
- System.out.println(writer);
- indexDocs(writer, list);
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- countDownLatch2.countDown();
- try {
- writer.commit();
- writer.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
+ this.subIndex = subIndex;
+ this.countDownLatch1 = countDownLatch1;
+ this.countDownLatch2 = countDownLatch2;
+ this.list = list;
+ }
+
+ public BaseIndex(
+ String path, CountDownLatch countDownLatch1, CountDownLatch countDownLatch2, List list) {
+ super();
+ try {
+ // 单目录索引创建
+ File file = new File(path);
+ if (!file.exists()) {
+ file.mkdir();
+ }
+ this.writer = IndexUtil.getIndexWriter(path, true);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ ;
+ this.countDownLatch1 = countDownLatch1;
+ this.countDownLatch2 = countDownLatch2;
+ this.list = list;
+ }
+
+ /**
+ * 创建索引
+ *
+ * @param writer
+ * @throws IOException
+ * @throws ParseException
+ */
+ public abstract void indexDoc(IndexWriter writer, T t) throws Exception;
+
+ /**
+ * 批量索引创建
+ *
+ * @param writer
+ * @param t
+ * @throws Exception
+ */
+ public void indexDocs(IndexWriter writer, List t) throws Exception {
+ for (T t2 : t) {
+ indexDoc(writer, t2);
+ }
+ }
+
+ @Override
+ public void run() {
+ try {
+ countDownLatch1.await();
+ System.out.println(writer);
+ indexDocs(writer, list);
+ } catch (Exception e) {
+ e.printStackTrace();
+ } finally {
+ countDownLatch2.countDown();
+ try {
+ writer.commit();
+ writer.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
}
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/lucene/IKAnalyzer.java b/src/main/java/com/rymcu/forest/lucene/lucene/IKAnalyzer.java
index 5053459..b9f5a65 100644
--- a/src/main/java/com/rymcu/forest/lucene/lucene/IKAnalyzer.java
+++ b/src/main/java/com/rymcu/forest/lucene/lucene/IKAnalyzer.java
@@ -1,26 +1,25 @@
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
- *
+ *
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- *
+ *
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
- *
*/
package com.rymcu.forest.lucene.lucene;
@@ -32,44 +31,44 @@ import org.apache.lucene.analysis.Tokenizer;
*/
public final class IKAnalyzer extends Analyzer {
- private boolean useSmart;
+ private boolean useSmart;
- public boolean useSmart() {
- return useSmart;
- }
+ public boolean useSmart() {
+ return useSmart;
+ }
- public void setUseSmart(boolean useSmart) {
- this.useSmart = useSmart;
- }
+ public void setUseSmart(boolean useSmart) {
+ this.useSmart = useSmart;
+ }
- /**
- * IK分词器Lucene Analyzer接口实现类
- *
- * 默认细粒度切分算法
- */
- public IKAnalyzer() {
- this(false);
- }
+ /**
+ * IK分词器Lucene Analyzer接口实现类
+ *
+ * 默认细粒度切分算法
+ */
+ public IKAnalyzer() {
+ this(false);
+ }
- /**
- * IK分词器Lucene Analyzer接口实现类
- *
- * @param useSmart 当为true时,分词器进行智能切分
- */
- public IKAnalyzer(boolean useSmart) {
- super();
- this.useSmart = useSmart;
- }
+ /**
+ * IK分词器Lucene Analyzer接口实现类
+ *
+ * @param useSmart 当为true时,分词器进行智能切分
+ */
+ public IKAnalyzer(boolean useSmart) {
+ super();
+ this.useSmart = useSmart;
+ }
- /**
- * lucene 6.0
- * 重载Analyzer接口,构造分词组件
- */
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer IKTokenizer = new IKTokenizer(this.useSmart());
- return new Analyzer.TokenStreamComponents(IKTokenizer);
- }
+ /**
+ * lucene 6.0
+ * 重载Analyzer接口,构造分词组件
+ */
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer IKTokenizer = new IKTokenizer(this.useSmart());
+ return new Analyzer.TokenStreamComponents(IKTokenizer);
+ }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/lucene/IKTokenizer.java b/src/main/java/com/rymcu/forest/lucene/lucene/IKTokenizer.java
index cb439f8..a5cd966 100644
--- a/src/main/java/com/rymcu/forest/lucene/lucene/IKTokenizer.java
+++ b/src/main/java/com/rymcu/forest/lucene/lucene/IKTokenizer.java
@@ -55,6 +55,7 @@ public final class IKTokenizer extends Tokenizer {
/**
* Lucene 6.0 Tokenizer适配器类构造函数
+ *
* @param useSmart
*/
public IKTokenizer(boolean useSmart) {
@@ -70,6 +71,7 @@ public final class IKTokenizer extends Tokenizer {
/**
* lucene 6.0 新增
* 方便创建 工厂类
+ *
* @param factory
* @param useSmart
*/
diff --git a/src/main/java/com/rymcu/forest/lucene/lucene/PortfolioBeanIndex.java b/src/main/java/com/rymcu/forest/lucene/lucene/PortfolioBeanIndex.java
index 544bb74..7a465ab 100644
--- a/src/main/java/com/rymcu/forest/lucene/lucene/PortfolioBeanIndex.java
+++ b/src/main/java/com/rymcu/forest/lucene/lucene/PortfolioBeanIndex.java
@@ -19,29 +19,29 @@ import java.util.concurrent.CountDownLatch;
*/
public class PortfolioBeanIndex extends BaseIndex {
- public PortfolioBeanIndex(
- String parentIndexPath,
- int subIndex,
- CountDownLatch countDownLatch1,
- CountDownLatch countDownLatch2,
- List list) {
- super(parentIndexPath, subIndex, countDownLatch1, countDownLatch2, list);
- }
-
- @Override
- public void indexDoc(IndexWriter writer, PortfolioLucene user) throws Exception {
- Document doc = new Document();
- Field id = new Field("id", user.getIdPortfolio() + "", TextField.TYPE_STORED);
- Field title = new Field("title", user.getPortfolioTitle(), TextField.TYPE_STORED);
- Field summary = new Field("summary", user.getPortfolioDescription(), TextField.TYPE_STORED);
- // 添加到Document中
- doc.add(id);
- doc.add(title);
- doc.add(summary);
- if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
- writer.addDocument(doc);
- } else {
- writer.updateDocument(new Term("id", user.getIdPortfolio() + ""), doc);
+ public PortfolioBeanIndex(
+ String parentIndexPath,
+ int subIndex,
+ CountDownLatch countDownLatch1,
+ CountDownLatch countDownLatch2,
+ List list) {
+ super(parentIndexPath, subIndex, countDownLatch1, countDownLatch2, list);
+ }
+
+ @Override
+ public void indexDoc(IndexWriter writer, PortfolioLucene user) throws Exception {
+ Document doc = new Document();
+ Field id = new Field("id", user.getIdPortfolio() + "", TextField.TYPE_STORED);
+ Field title = new Field("title", user.getPortfolioTitle(), TextField.TYPE_STORED);
+ Field summary = new Field("summary", user.getPortfolioDescription(), TextField.TYPE_STORED);
+ // 添加到Document中
+ doc.add(id);
+ doc.add(title);
+ doc.add(summary);
+ if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
+ writer.addDocument(doc);
+ } else {
+ writer.updateDocument(new Term("id", user.getIdPortfolio() + ""), doc);
+ }
}
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/mapper/ArticleLuceneMapper.java b/src/main/java/com/rymcu/forest/lucene/mapper/ArticleLuceneMapper.java
index 6ecb2a3..15629c5 100644
--- a/src/main/java/com/rymcu/forest/lucene/mapper/ArticleLuceneMapper.java
+++ b/src/main/java/com/rymcu/forest/lucene/mapper/ArticleLuceneMapper.java
@@ -16,28 +16,28 @@ import java.util.List;
@Mapper
public interface ArticleLuceneMapper {
- /**
- * 加载所有文章内容
- *
- * @return
- */
- List getAllArticleLucene();
+ /**
+ * 加载所有文章内容
+ *
+ * @return
+ */
+ List getAllArticleLucene();
- /**
- * 加载所有文章内容
- *
- * @param ids 文章id(半角逗号分隔)
- * @return
- */
- List getArticlesByIds(@Param("ids") Long[] ids);
+ /**
+ * 加载所有文章内容
+ *
+ * @param ids 文章id(半角逗号分隔)
+ * @return
+ */
+ List getArticlesByIds(@Param("ids") Long[] ids);
- /**
- * 加载文章内容
- *
- * @param id 文章id
- * @return
- */
- ArticleLucene getById(@Param("id") Long id);
+ /**
+ * 加载文章内容
+ *
+ * @param id 文章id
+ * @return
+ */
+ ArticleLucene getById(@Param("id") Long id);
}
diff --git a/src/main/java/com/rymcu/forest/lucene/mapper/PortfolioLuceneMapper.java b/src/main/java/com/rymcu/forest/lucene/mapper/PortfolioLuceneMapper.java
index c44ca01..15ce13d 100644
--- a/src/main/java/com/rymcu/forest/lucene/mapper/PortfolioLuceneMapper.java
+++ b/src/main/java/com/rymcu/forest/lucene/mapper/PortfolioLuceneMapper.java
@@ -16,26 +16,26 @@ import java.util.List;
@Mapper
public interface PortfolioLuceneMapper {
- /**
- * 加载所有作品集信息
- *
- * @return
- */
- List getAllPortfolioLucene();
+ /**
+ * 加载所有作品集信息
+ *
+ * @return
+ */
+ List getAllPortfolioLucene();
- /**
- * 加载所有作品集信息
- *
- * @param ids 作品集id(半角逗号分隔)
- * @return
- */
- List getPortfoliosByIds(@Param("ids") Long[] ids);
+ /**
+ * 加载所有作品集信息
+ *
+ * @param ids 作品集id(半角逗号分隔)
+ * @return
+ */
+ List getPortfoliosByIds(@Param("ids") Long[] ids);
- /**
- * 加载作品集
- *
- * @param id 用户id
- * @return
- */
- PortfolioLucene getById(@Param("id") Long id);
+ /**
+ * 加载作品集
+ *
+ * @param id 用户id
+ * @return
+ */
+ PortfolioLucene getById(@Param("id") Long id);
}
diff --git a/src/main/java/com/rymcu/forest/lucene/mapper/UserDicMapper.java b/src/main/java/com/rymcu/forest/lucene/mapper/UserDicMapper.java
index b5fee46..324d54e 100644
--- a/src/main/java/com/rymcu/forest/lucene/mapper/UserDicMapper.java
+++ b/src/main/java/com/rymcu/forest/lucene/mapper/UserDicMapper.java
@@ -15,39 +15,39 @@ import java.util.List;
@Mapper
public interface UserDicMapper {
- /**
- * 加载所有字典
- *
- * @return
- */
- List getAllDic();
+ /**
+ * 加载所有字典
+ *
+ * @return
+ */
+ List getAllDic();
- /**
- * 加载所有字典信息
- *
- * @return
- */
- List getAll();
+ /**
+ * 加载所有字典信息
+ *
+ * @return
+ */
+ List getAll();
- /**
- * 增加字典
- *
- * @return
- */
- void addDic(@Param("dic") String userDic);
+ /**
+ * 增加字典
+ *
+ * @return
+ */
+ void addDic(@Param("dic") String userDic);
- /**
- * 删除字典
- *
- * @param id
- */
- void deleteDic(@Param("id") String id);
+ /**
+ * 删除字典
+ *
+ * @param id
+ */
+ void deleteDic(@Param("id") String id);
- /**
- * 更新字典
- *
- * @param id
- * @param userDic
- */
- void updateDic(@Param("id") Integer id, @Param("dic") String userDic);
+ /**
+ * 更新字典
+ *
+ * @param id
+ * @param userDic
+ */
+ void updateDic(@Param("id") Integer id, @Param("dic") String userDic);
}
diff --git a/src/main/java/com/rymcu/forest/lucene/mapper/UserLuceneMapper.java b/src/main/java/com/rymcu/forest/lucene/mapper/UserLuceneMapper.java
index ad84019..91a0677 100644
--- a/src/main/java/com/rymcu/forest/lucene/mapper/UserLuceneMapper.java
+++ b/src/main/java/com/rymcu/forest/lucene/mapper/UserLuceneMapper.java
@@ -16,26 +16,26 @@ import java.util.List;
@Mapper
public interface UserLuceneMapper {
- /**
- * 加载所有用户信息
- *
- * @return
- */
- List getAllUserLucene();
+ /**
+ * 加载所有用户信息
+ *
+ * @return
+ */
+ List getAllUserLucene();
- /**
- * 加载所有用户信息
- *
- * @param ids 用户id(半角逗号分隔)
- * @return
- */
- List getUsersByIds(@Param("ids") Long[] ids);
+ /**
+ * 加载所有用户信息
+ *
+ * @param ids 用户id(半角逗号分隔)
+ * @return
+ */
+ List getUsersByIds(@Param("ids") Long[] ids);
- /**
- * 加载 UserLucene
- *
- * @param id 用户id
- * @return
- */
- UserLucene getById(@Param("id") String id);
+ /**
+ * 加载 UserLucene
+ *
+ * @param id 用户id
+ * @return
+ */
+ UserLucene getById(@Param("id") String id);
}
diff --git a/src/main/java/com/rymcu/forest/lucene/model/ArticleLucene.java b/src/main/java/com/rymcu/forest/lucene/model/ArticleLucene.java
index e870484..d17407c 100644
--- a/src/main/java/com/rymcu/forest/lucene/model/ArticleLucene.java
+++ b/src/main/java/com/rymcu/forest/lucene/model/ArticleLucene.java
@@ -1,6 +1,6 @@
package com.rymcu.forest.lucene.model;
-import com.fasterxml.jackson.annotation.JsonFormat;
+
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@@ -18,16 +18,23 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class ArticleLucene {
- /** 文章编号 */
- @JsonFormat(shape = JsonFormat.Shape.STRING)
- private Long idArticle;
+ /**
+ * 文章编号
+ */
+ private Long idArticle;
- /** 文章标题 */
- private String articleTitle;
+ /**
+ * 文章标题
+ */
+ private String articleTitle;
- /** 文章内容 */
- private String articleContent;
+ /**
+ * 文章内容
+ */
+ private String articleContent;
- /** 相关度评分 */
- private String score;
+ /**
+ * 相关度评分
+ */
+ private String score;
}
diff --git a/src/main/java/com/rymcu/forest/lucene/model/PortfolioLucene.java b/src/main/java/com/rymcu/forest/lucene/model/PortfolioLucene.java
index 81d6b58..d4cf9ff 100644
--- a/src/main/java/com/rymcu/forest/lucene/model/PortfolioLucene.java
+++ b/src/main/java/com/rymcu/forest/lucene/model/PortfolioLucene.java
@@ -1,6 +1,6 @@
package com.rymcu.forest.lucene.model;
-import com.fasterxml.jackson.annotation.JsonFormat;
+
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@@ -18,16 +18,23 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class PortfolioLucene {
- /** 作品集编号 */
- @JsonFormat(shape = JsonFormat.Shape.STRING)
- private Long idPortfolio;
+ /**
+ * 作品集编号
+ */
+ private Long idPortfolio;
- /** 作品集名称 */
- private String portfolioTitle;
+ /**
+ * 作品集名称
+ */
+ private String portfolioTitle;
- /** 作品集介绍 */
- private String portfolioDescription;
+ /**
+ * 作品集介绍
+ */
+ private String portfolioDescription;
- /** 相关度评分 */
- private String score;
+ /**
+ * 相关度评分
+ */
+ private String score;
}
diff --git a/src/main/java/com/rymcu/forest/lucene/model/UserDic.java b/src/main/java/com/rymcu/forest/lucene/model/UserDic.java
index 27aae52..b1950c3 100644
--- a/src/main/java/com/rymcu/forest/lucene/model/UserDic.java
+++ b/src/main/java/com/rymcu/forest/lucene/model/UserDic.java
@@ -15,11 +15,15 @@ import javax.persistence.Table;
@Data
@Table(name = "forest_lucene_user_dic")
public class UserDic {
- /** 主键 */
- @Id
- @GeneratedValue(generator = "JDBC")
- private Integer id;
+ /**
+ * 主键
+ */
+ @Id
+ @GeneratedValue(generator = "JDBC")
+ private Integer id;
- /** 字典 */
- private String dic;
+ /**
+ * 字典
+ */
+ private String dic;
}
diff --git a/src/main/java/com/rymcu/forest/lucene/model/UserLucene.java b/src/main/java/com/rymcu/forest/lucene/model/UserLucene.java
index e6d9e91..06fb8b1 100644
--- a/src/main/java/com/rymcu/forest/lucene/model/UserLucene.java
+++ b/src/main/java/com/rymcu/forest/lucene/model/UserLucene.java
@@ -1,6 +1,6 @@
package com.rymcu.forest.lucene.model;
-import com.fasterxml.jackson.annotation.JsonFormat;
+
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@@ -18,16 +18,23 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
public class UserLucene {
- /** 用户编号 */
- @JsonFormat(shape = JsonFormat.Shape.STRING)
- private Long idUser;
+ /**
+ * 用户编号
+ */
+ private Long idUser;
- /** 昵称 */
- private String nickname;
+ /**
+ * 昵称
+ */
+ private String nickname;
- /** 签名 */
- private String signature;
+ /**
+ * 签名
+ */
+ private String signature;
- /** 相关度评分 */
- private String score;
+ /**
+ * 相关度评分
+ */
+ private String score;
}
diff --git a/src/main/java/com/rymcu/forest/lucene/service/LuceneService.java b/src/main/java/com/rymcu/forest/lucene/service/LuceneService.java
index 6e9c6d7..4751247 100644
--- a/src/main/java/com/rymcu/forest/lucene/service/LuceneService.java
+++ b/src/main/java/com/rymcu/forest/lucene/service/LuceneService.java
@@ -13,63 +13,63 @@ import java.util.List;
*/
public interface LuceneService {
- /**
- * 将文章的数据解析为一个个关键字词存储到索引文件中
- *
- * @param list
- */
- void writeArticle(List list);
+ /**
+ * 将文章的数据解析为一个个关键字词存储到索引文件中
+ *
+ * @param list
+ */
+ void writeArticle(List list);
- /**
- * 写入单个文章索引
- *
- * @param id
- */
- void writeArticle(Long id);
+ /**
+ * 写入单个文章索引
+ *
+ * @param id
+ */
+ void writeArticle(Long id);
- /**
- * 写入单个文章索引
- *
- * @param articleLucene
- */
- void writeArticle(ArticleLucene articleLucene);
+ /**
+ * 写入单个文章索引
+ *
+ * @param articleLucene
+ */
+ void writeArticle(ArticleLucene articleLucene);
- /**
- * 更新单个文章索引
- *
- * @param id
- */
- void updateArticle(Long id);
+ /**
+ * 更新单个文章索引
+ *
+ * @param id
+ */
+ void updateArticle(Long id);
- /**
- * 删除单个文章索引
- *
- * @param id
- */
- void deleteArticle(Long id);
+ /**
+ * 删除单个文章索引
+ *
+ * @param id
+ */
+ void deleteArticle(Long id);
- /**
- * 关键词搜索
- *
- * @param value
- * @return
- * @throws Exception
- */
- List searchArticle(String value);
+ /**
+ * 关键词搜索
+ *
+ * @param value
+ * @return
+ * @throws Exception
+ */
+ List searchArticle(String value);
- /**
- * 加载所有文章内容
- *
- * @return
- */
- List getAllArticleLucene();
+ /**
+ * 加载所有文章内容
+ *
+ * @return
+ */
+ List getAllArticleLucene();
- /**
- * 加载所有文章内容
- *
- * @param ids 文章id(半角逗号分隔)
- * @return
- */
- List getArticlesByIds(Long[] ids);
+ /**
+ * 加载所有文章内容
+ *
+ * @param ids 文章id(半角逗号分隔)
+ * @return
+ */
+ List getArticlesByIds(Long[] ids);
}
diff --git a/src/main/java/com/rymcu/forest/lucene/service/PortfolioLuceneService.java b/src/main/java/com/rymcu/forest/lucene/service/PortfolioLuceneService.java
index e532548..4c1b09c 100644
--- a/src/main/java/com/rymcu/forest/lucene/service/PortfolioLuceneService.java
+++ b/src/main/java/com/rymcu/forest/lucene/service/PortfolioLuceneService.java
@@ -13,62 +13,62 @@ import java.util.List;
*/
public interface PortfolioLuceneService {
- /**
- * 批量写入作品集信息到索引
- *
- * @param list
- */
- void writePortfolio(List list);
+ /**
+ * 批量写入作品集信息到索引
+ *
+ * @param list
+ */
+ void writePortfolio(List list);
- /**
- * 写入单个作品集索引
- *
- * @param id
- */
- void writePortfolio(Long id);
+ /**
+ * 写入单个作品集索引
+ *
+ * @param id
+ */
+ void writePortfolio(Long id);
- /**
- * 写入单个作品集索引
- *
- * @param portfolioLucene
- */
- void writePortfolio(PortfolioLucene portfolioLucene);
+ /**
+ * 写入单个作品集索引
+ *
+ * @param portfolioLucene
+ */
+ void writePortfolio(PortfolioLucene portfolioLucene);
- /**
- * 更新单个作品集索引
- *
- * @param id
- */
- void updatePortfolio(Long id);
+ /**
+ * 更新单个作品集索引
+ *
+ * @param id
+ */
+ void updatePortfolio(Long id);
- /**
- * 删除单个作品集索引
- *
- * @param id
- */
- void deletePortfolio(Long id);
+ /**
+ * 删除单个作品集索引
+ *
+ * @param id
+ */
+ void deletePortfolio(Long id);
- /**
- * 关键词搜索
- *
- * @param value
- * @return
- * @throws Exception
- */
- List searchPortfolio(String value);
+ /**
+ * 关键词搜索
+ *
+ * @param value
+ * @return
+ * @throws Exception
+ */
+ List searchPortfolio(String value);
- /**
- * 加载所有作品集内容
- *
- * @return
- */
- List getAllPortfolioLucene();
+ /**
+ * 加载所有作品集内容
+ *
+ * @return
+ */
+ List getAllPortfolioLucene();
- /**
- * 加载所有作品集内容
- *
- * @param ids 作品集id(半角逗号分隔)
- * @return
- */
- List getPortfoliosByIds(Long[] ids);
+ /**
+ * 加载所有作品集内容
+ *
+ * @param ids 作品集id(半角逗号分隔)
+ * @return
+ */
+ List getPortfoliosByIds(Long[] ids);
}
diff --git a/src/main/java/com/rymcu/forest/lucene/service/UserDicService.java b/src/main/java/com/rymcu/forest/lucene/service/UserDicService.java
index beeade2..ca1d0a8 100644
--- a/src/main/java/com/rymcu/forest/lucene/service/UserDicService.java
+++ b/src/main/java/com/rymcu/forest/lucene/service/UserDicService.java
@@ -13,44 +13,43 @@ import java.util.List;
*/
public interface UserDicService {
- /**
- * 加载所有字典
- *
- * @return
- */
- List getAllDic();
+ /**
+ * 加载所有字典
+ *
+ * @return
+ */
+ List getAllDic();
- /**
- * 加载所有字典
- *
- * @return
- */
- List getAll();
+ /**
+ * 加载所有字典
+ *
+ * @return
+ */
+ List getAll();
- /**
- * 增加字典
- *
- * @return
- */
- void addDic(String dic);
+ /**
+ * 增加字典
+ *
+ * @return
+ */
+ void addDic(String dic);
- /**
- * 删除字典
- *
- * @param id
- */
- void deleteDic(String id);
+ /**
+ * 删除字典
+ *
+ * @param id
+ */
+ void deleteDic(String id);
- /**
- * 更新字典
- *
- * @param userDic
- */
- void updateDic(UserDic userDic);
+ /**
+ * 更新字典
+ *
+ * @param userDic
+ */
+ void updateDic(UserDic userDic);
- /**
- * 写入字典至内存
- *
- */
- void writeUserDic() throws FileNotFoundException;
+ /**
+ * 写入字典至内存
+ */
+ void writeUserDic() throws FileNotFoundException;
}
diff --git a/src/main/java/com/rymcu/forest/lucene/service/UserLuceneService.java b/src/main/java/com/rymcu/forest/lucene/service/UserLuceneService.java
index fd2c6bd..bb6be94 100644
--- a/src/main/java/com/rymcu/forest/lucene/service/UserLuceneService.java
+++ b/src/main/java/com/rymcu/forest/lucene/service/UserLuceneService.java
@@ -13,62 +13,62 @@ import java.util.List;
*/
public interface UserLuceneService {
- /**
- * 批量写入用户信息到索引
- *
- * @param list
- */
- void writeUser(List list);
+ /**
+ * 批量写入用户信息到索引
+ *
+ * @param list
+ */
+ void writeUser(List list);
- /**
- * 写入单个用户索引
- *
- * @param id
- */
- void writeUser(String id);
+ /**
+ * 写入单个用户索引
+ *
+ * @param id
+ */
+ void writeUser(String id);
- /**
- * 写入单个用户索引
- *
- * @param UserLucene
- */
- void writeUser(UserLucene UserLucene);
+ /**
+ * 写入单个用户索引
+ *
+ * @param UserLucene
+ */
+ void writeUser(UserLucene UserLucene);
- /**
- * 更新单个用户索引
- *
- * @param id
- */
- void updateUser(String id);
+ /**
+ * 更新单个用户索引
+ *
+ * @param id
+ */
+ void updateUser(String id);
- /**
- * 删除单个用户索引
- *
- * @param id
- */
- void deleteUser(String id);
+ /**
+ * 删除单个用户索引
+ *
+ * @param id
+ */
+ void deleteUser(String id);
- /**
- * 关键词搜索
- *
- * @param value
- * @return
- * @throws Exception
- */
- List searchUser(String value);
+ /**
+ * 关键词搜索
+ *
+ * @param value
+ * @return
+ * @throws Exception
+ */
+ List searchUser(String value);
- /**
- * 加载所有用户内容
- *
- * @return
- */
- List getAllUserLucene();
+ /**
+ * 加载所有用户内容
+ *
+ * @return
+ */
+ List getAllUserLucene();
- /**
- * 加载所有用户内容
- *
- * @param ids 用户id(半角逗号分隔)
- * @return
- */
- List getUsersByIds(Long[] ids);
+ /**
+ * 加载所有用户内容
+ *
+ * @param ids 用户id(半角逗号分隔)
+ * @return
+ */
+ List getUsersByIds(Long[] ids);
}
diff --git a/src/main/java/com/rymcu/forest/lucene/service/impl/UserDicServiceImpl.java b/src/main/java/com/rymcu/forest/lucene/service/impl/UserDicServiceImpl.java
index c59a1e9..852fbd9 100644
--- a/src/main/java/com/rymcu/forest/lucene/service/impl/UserDicServiceImpl.java
+++ b/src/main/java/com/rymcu/forest/lucene/service/impl/UserDicServiceImpl.java
@@ -20,60 +20,61 @@ import java.util.List;
@Service
public class UserDicServiceImpl implements UserDicService {
- @Resource private UserDicMapper userDicMapper;
+ @Resource
+ private UserDicMapper userDicMapper;
- @Override
- public List getAllDic() {
+ @Override
+ public List getAllDic() {
- return userDicMapper.getAllDic();
- }
-
- @Override
- public List getAll() {
- return userDicMapper.getAll();
- }
-
- @Override
- public void addDic(String dic) {
- userDicMapper.addDic(dic);
- writeUserDic();
- }
-
- @Override
- public void deleteDic(String id) {
- userDicMapper.deleteDic(id);
- writeUserDic();
- }
-
- @Override
- public void updateDic(UserDic userDic) {
- userDicMapper.updateDic(userDic.getId(), userDic.getDic());
- writeUserDic();
- }
-
- @Override
- public void writeUserDic() {
- try {
- String filePath = "lucene/userDic/";
- File file = new File(filePath);
- if (!file.exists()) {
- file.mkdirs();
- }
- FileOutputStream stream = new FileOutputStream(file + "/userDic.dic", false);
- OutputStreamWriter outfw = new OutputStreamWriter(stream, StandardCharsets.UTF_8);
- PrintWriter fw = new PrintWriter(new BufferedWriter(outfw));
- userDicMapper
- .getAllDic()
- .forEach(
- each -> {
- fw.write(each);
- fw.write("\r\n");
- });
- fw.flush();
- fw.close();
- Dictionary.getSingleton().updateUserDict();
- } catch (IOException e) {
- e.printStackTrace();
+ return userDicMapper.getAllDic();
+ }
+
+ @Override
+ public List getAll() {
+ return userDicMapper.getAll();
+ }
+
+ @Override
+ public void addDic(String dic) {
+ userDicMapper.addDic(dic);
+ writeUserDic();
+ }
+
+ @Override
+ public void deleteDic(String id) {
+ userDicMapper.deleteDic(id);
+ writeUserDic();
+ }
+
+ @Override
+ public void updateDic(UserDic userDic) {
+ userDicMapper.updateDic(userDic.getId(), userDic.getDic());
+ writeUserDic();
+ }
+
+ @Override
+ public void writeUserDic() {
+ try {
+ String filePath = "lucene/userDic/";
+ File file = new File(filePath);
+ if (!file.exists()) {
+ file.mkdirs();
+ }
+ FileOutputStream stream = new FileOutputStream(file + "/userDic.dic", false);
+ OutputStreamWriter outfw = new OutputStreamWriter(stream, StandardCharsets.UTF_8);
+ PrintWriter fw = new PrintWriter(new BufferedWriter(outfw));
+ userDicMapper
+ .getAllDic()
+ .forEach(
+ each -> {
+ fw.write(each);
+ fw.write("\r\n");
+ });
+ fw.flush();
+ fw.close();
+ Dictionary.getSingleton().updateUserDict();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
}
- }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/service/impl/UserLuceneServiceImpl.java b/src/main/java/com/rymcu/forest/lucene/service/impl/UserLuceneServiceImpl.java
index 61212b3..243d0e7 100644
--- a/src/main/java/com/rymcu/forest/lucene/service/impl/UserLuceneServiceImpl.java
+++ b/src/main/java/com/rymcu/forest/lucene/service/impl/UserLuceneServiceImpl.java
@@ -41,148 +41,149 @@ import java.util.concurrent.Executors;
@Service
public class UserLuceneServiceImpl implements UserLuceneService {
- @Resource private UserLuceneMapper userLuceneMapper;
+ @Resource
+ private UserLuceneMapper userLuceneMapper;
- /**
- * 将文章的数据解析为一个个关键字词存储到索引文件中
- *
- * @param list
- */
- @Override
- public void writeUser(List list) {
- try {
- int totalCount = list.size();
- int perThreadCount = 3000;
- int threadCount = totalCount / perThreadCount + (totalCount % perThreadCount == 0 ? 0 : 1);
- ExecutorService pool = Executors.newFixedThreadPool(threadCount);
- CountDownLatch countDownLatch1 = new CountDownLatch(1);
- CountDownLatch countDownLatch2 = new CountDownLatch(threadCount);
+ /**
+ * 将用户的数据解析为一个个关键字词存储到索引文件中
+ *
+ * @param list
+ */
+ @Override
+ public void writeUser(List list) {
+ try {
+ int totalCount = list.size();
+ int perThreadCount = 3000;
+ int threadCount = totalCount / perThreadCount + (totalCount % perThreadCount == 0 ? 0 : 1);
+ ExecutorService pool = Executors.newFixedThreadPool(threadCount);
+ CountDownLatch countDownLatch1 = new CountDownLatch(1);
+ CountDownLatch countDownLatch2 = new CountDownLatch(threadCount);
- for (int i = 0; i < threadCount; i++) {
- int start = i * perThreadCount;
- int end = Math.min((i + 1) * perThreadCount, totalCount);
- List subList = list.subList(start, end);
- Runnable runnable =
- new UserBeanIndex(LucenePath.USER_PATH, i, countDownLatch1, countDownLatch2, subList);
- // 子线程交给线程池管理
- pool.execute(runnable);
- }
- countDownLatch1.countDown();
- System.out.println("开始创建索引");
- // 等待所有线程都完成
- countDownLatch2.await();
- // 线程全部完成工作
- System.out.println("所有线程都创建索引完毕");
- // 释放线程池资源
- pool.shutdown();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
-
- @Override
- public void writeUser(String id) {
- writeUser(userLuceneMapper.getById(id));
- }
-
- @Override
- public void writeUser(UserLucene UserLucene) {
- UserIndexUtil.addIndex(UserLucene);
- }
-
- @Override
- public void updateUser(String id) {
- UserIndexUtil.updateIndex(userLuceneMapper.getById(id));
- }
-
- @Override
- public void deleteUser(String id) {
- UserIndexUtil.deleteIndex(id);
- }
-
- @Override
- public List searchUser(String value) {
- List resList = new ArrayList<>();
- ExecutorService service = Executors.newCachedThreadPool();
- // 定义分词器
- Analyzer analyzer = new IKAnalyzer();
- try {
- IndexSearcher searcher = SearchUtil.getIndexSearcherByParentPath(LucenePath.USER_PATH, service);
- String[] fields = {"nickname", "signature"};
- // 构造Query对象
- MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
-
- BufferedReader in =
- new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
- String line = value != null ? value : in.readLine();
- Query query = parser.parse(line);
- // 最终被分词后添加的前缀和后缀处理器,默认是粗体
- SimpleHTMLFormatter htmlFormatter =
- new SimpleHTMLFormatter("", "");
- // 高亮搜索的词添加到高亮处理器中
- Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
-
- // 获取搜索的结果,指定返回document返回的个数
- // TODO 默认搜索结果为显示第一页,1000 条,可以优化
- TopDocs results = SearchUtil.getScoreDocsByPerPage(1, 100, searcher, query);
- ScoreDoc[] hits = results.scoreDocs;
-
- // 遍历,输出
- for (ScoreDoc hit : hits) {
- int id = hit.doc;
- float score = hit.score;
- Document hitDoc = searcher.doc(hit.doc);
- // 获取到 signature
- String signature = hitDoc.get("signature");
- // 将查询的词和搜索词匹配,匹配到添加前缀和后缀
- TokenStream tokenStream = TokenSources.getTokenStream("signature", searcher.getIndexReader().getTermVectors(id), signature, analyzer, -1);
-
- // 传入的第二个参数是查询的值
- TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, signature, false, 10);
- StringBuilder baikeValue = new StringBuilder();
- for (TextFragment textFragment : frag) {
- if ((textFragment != null) && (textFragment.getScore() > 0)) {
- // if ((frag[j] != null)) {
- // 获取 summary 的值
- baikeValue.append(textFragment.toString());
- }
+ for (int i = 0; i < threadCount; i++) {
+ int start = i * perThreadCount;
+ int end = Math.min((i + 1) * perThreadCount, totalCount);
+ List subList = list.subList(start, end);
+ Runnable runnable =
+ new UserBeanIndex(LucenePath.USER_PATH, i, countDownLatch1, countDownLatch2, subList);
+ // 子线程交给线程池管理
+ pool.execute(runnable);
+ }
+ countDownLatch1.countDown();
+ System.out.println("开始创建索引");
+ // 等待所有线程都完成
+ countDownLatch2.await();
+ // 线程全部完成工作
+ System.out.println("所有线程都创建索引完毕");
+ // 释放线程池资源
+ pool.shutdown();
+ } catch (Exception e) {
+ e.printStackTrace();
}
- // 获取到 nickname
- String nickname = hitDoc.get("nickname");
- TokenStream titleTokenStream = TokenSources.getTokenStream("nickname", searcher.getIndexReader().getTermVectors(id), nickname, analyzer, -1);
- TextFragment[] titleFrag =
- highlighter.getBestTextFragments(titleTokenStream, nickname, false, 10);
- StringBuilder titleValue = new StringBuilder();
- for (int j = 0; j < titleFrag.length; j++) {
- if ((frag[j] != null)) {
- titleValue.append(titleFrag[j].toString());
- }
- }
- resList.add(
- UserLucene.builder()
- .idUser(Long.valueOf(hitDoc.get("id")))
- .nickname(titleValue.toString())
- .signature(baikeValue.toString())
- .score(String.valueOf(score))
- .build());
- }
- } catch (IOException | ParseException | InvalidTokenOffsetsException e) {
- System.out.println(e.getMessage());
- e.printStackTrace();
- } finally {
- service.shutdownNow();
}
- return resList;
- }
- @Override
- public List getAllUserLucene() {
- return userLuceneMapper.getAllUserLucene();
- }
+ @Override
+ public void writeUser(String id) {
+ writeUser(userLuceneMapper.getById(id));
+ }
- @Override
- public List getUsersByIds(Long[] ids) {
- return userLuceneMapper.getUsersByIds(ids);
- }
+ @Override
+ public void writeUser(UserLucene UserLucene) {
+ UserIndexUtil.addIndex(UserLucene);
+ }
+
+ @Override
+ public void updateUser(String id) {
+ UserIndexUtil.updateIndex(userLuceneMapper.getById(id));
+ }
+
+ @Override
+ public void deleteUser(String id) {
+ UserIndexUtil.deleteIndex(id);
+ }
+
+ @Override
+ public List searchUser(String value) {
+ List resList = new ArrayList<>();
+ ExecutorService service = Executors.newCachedThreadPool();
+ // 定义分词器
+ Analyzer analyzer = new IKAnalyzer();
+ try {
+ IndexSearcher searcher = SearchUtil.getIndexSearcherByParentPath(LucenePath.USER_PATH, service);
+ String[] fields = {"nickname", "signature"};
+ // 构造Query对象
+ MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
+
+ BufferedReader in =
+ new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
+ String line = value != null ? value : in.readLine();
+ Query query = parser.parse(line);
+ // 最终被分词后添加的前缀和后缀处理器,默认是粗体
+ SimpleHTMLFormatter htmlFormatter =
+ new SimpleHTMLFormatter("", "");
+ // 高亮搜索的词添加到高亮处理器中
+ Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
+
+ // 获取搜索的结果,指定返回document返回的个数
+ // TODO 默认搜索结果为显示第一页,1000 条,可以优化
+ TopDocs results = SearchUtil.getScoreDocsByPerPage(1, 100, searcher, query);
+ ScoreDoc[] hits = results.scoreDocs;
+
+ // 遍历,输出
+ for (ScoreDoc hit : hits) {
+ int id = hit.doc;
+ float score = hit.score;
+ Document hitDoc = searcher.doc(hit.doc);
+ // 获取到 signature
+ String signature = hitDoc.get("signature");
+ // 将查询的词和搜索词匹配,匹配到添加前缀和后缀
+ TokenStream tokenStream = TokenSources.getTokenStream("signature", searcher.getIndexReader().getTermVectors(id), signature, analyzer, -1);
+
+ // 传入的第二个参数是查询的值
+ TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, signature, false, 10);
+ StringBuilder baikeValue = new StringBuilder();
+ for (TextFragment textFragment : frag) {
+ if ((textFragment != null) && (textFragment.getScore() > 0)) {
+ // if ((frag[j] != null)) {
+ // 获取 summary 的值
+ baikeValue.append(textFragment.toString());
+ }
+ }
+ // 获取到 nickname
+ String nickname = hitDoc.get("nickname");
+ TokenStream titleTokenStream = TokenSources.getTokenStream("nickname", searcher.getIndexReader().getTermVectors(id), nickname, analyzer, -1);
+ TextFragment[] titleFrag =
+ highlighter.getBestTextFragments(titleTokenStream, nickname, false, 10);
+ StringBuilder titleValue = new StringBuilder();
+ for (int j = 0; j < titleFrag.length; j++) {
+ if ((titleFrag[j] != null)) {
+ titleValue.append(titleFrag[j].toString());
+ }
+ }
+ resList.add(
+ UserLucene.builder()
+ .idUser(Long.valueOf(hitDoc.get("id")))
+ .nickname(titleValue.toString())
+ .signature(baikeValue.toString())
+ .score(String.valueOf(score))
+ .build());
+ }
+ } catch (IOException | ParseException | InvalidTokenOffsetsException e) {
+ System.out.println(e.getMessage());
+ e.printStackTrace();
+ } finally {
+ service.shutdownNow();
+ }
+ return resList;
+ }
+
+ @Override
+ public List getAllUserLucene() {
+ return userLuceneMapper.getAllUserLucene();
+ }
+
+ @Override
+ public List getUsersByIds(Long[] ids) {
+ return userLuceneMapper.getUsersByIds(ids);
+ }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/util/IndexUtil.java b/src/main/java/com/rymcu/forest/lucene/util/IndexUtil.java
index be66434..266603b 100644
--- a/src/main/java/com/rymcu/forest/lucene/util/IndexUtil.java
+++ b/src/main/java/com/rymcu/forest/lucene/util/IndexUtil.java
@@ -22,12 +22,13 @@ import java.nio.file.Paths;
public class IndexUtil {
/**
* 创建索引写入器
+ *
* @param indexPath
* @param create
* @return
* @throws IOException
*/
- public static IndexWriter getIndexWriter(String indexPath,boolean create) throws IOException{
+ public static IndexWriter getIndexWriter(String indexPath, boolean create) throws IOException {
Directory dir = FSDirectory.open(Paths.get(indexPath));
Analyzer analyzer = new IKAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
@@ -40,9 +41,9 @@ public class IndexUtil {
//值较小有利于追加索引的速度
//值较大,适合批量建立索引和更快的搜索
mergePolicy.setMaxMergeDocs(5000);
- if (create){
+ if (create) {
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
- }else {
+ } else {
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
}
return new IndexWriter(dir, iwc);
diff --git a/src/main/java/com/rymcu/forest/lucene/util/LucenePath.java b/src/main/java/com/rymcu/forest/lucene/util/LucenePath.java
index 0595bd0..d09ba56 100644
--- a/src/main/java/com/rymcu/forest/lucene/util/LucenePath.java
+++ b/src/main/java/com/rymcu/forest/lucene/util/LucenePath.java
@@ -7,27 +7,41 @@ package com.rymcu.forest.lucene.util;
*/
public final class LucenePath {
- /** lucene 目录 */
- public static final String INDEX_PATH = "/lucene/index";
+ /**
+ * lucene 目录
+ */
+ public static final String INDEX_PATH = "/lucene/index";
- /** 文章 lucene 目录 */
- public static final String ARTICLE_INDEX_PATH = INDEX_PATH + "/article";
+ /**
+ * 文章 lucene 目录
+ */
+ public static final String ARTICLE_INDEX_PATH = INDEX_PATH + "/article";
- /** 文章增量 lucene 目录 */
- public static final String ARTICLE_INCREMENT_INDEX_PATH =
- System.getProperty("user.dir") + ARTICLE_INDEX_PATH + "/index777";
+ /**
+ * 文章增量 lucene 目录
+ */
+ public static final String ARTICLE_INCREMENT_INDEX_PATH =
+ System.getProperty("user.dir") + ARTICLE_INDEX_PATH + "/index777";
- /** 用户 lucene 目录 */
- public static final String USER_PATH = INDEX_PATH + "/user";
+ /**
+ * 用户 lucene 目录
+ */
+ public static final String USER_PATH = INDEX_PATH + "/user";
- /** 用户增量 lucene 目录 */
- public static final String USER_INCREMENT_INDEX_PATH =
- System.getProperty("user.dir") + USER_PATH + "/index777";
+ /**
+ * 用户增量 lucene 目录
+ */
+ public static final String USER_INCREMENT_INDEX_PATH =
+ System.getProperty("user.dir") + USER_PATH + "/index777";
- /** 作品集 lucene 目录 */
- public static final String PORTFOLIO_PATH = INDEX_PATH + "/portfolio";
+ /**
+ * 作品集 lucene 目录
+ */
+ public static final String PORTFOLIO_PATH = INDEX_PATH + "/portfolio";
- /** 作品集增量 lucene 目录 */
- public static final String PORTFOLIO_INCREMENT_INDEX_PATH =
- System.getProperty("user.dir") + PORTFOLIO_PATH + "/index777";
+ /**
+ * 作品集增量 lucene 目录
+ */
+ public static final String PORTFOLIO_INCREMENT_INDEX_PATH =
+ System.getProperty("user.dir") + PORTFOLIO_PATH + "/index777";
}
diff --git a/src/main/java/com/rymcu/forest/lucene/util/PortfolioIndexUtil.java b/src/main/java/com/rymcu/forest/lucene/util/PortfolioIndexUtil.java
index a36cda4..7b3f84f 100644
--- a/src/main/java/com/rymcu/forest/lucene/util/PortfolioIndexUtil.java
+++ b/src/main/java/com/rymcu/forest/lucene/util/PortfolioIndexUtil.java
@@ -20,65 +20,71 @@ import java.util.Arrays;
*/
public class PortfolioIndexUtil {
- /** lucene索引保存目录 */
- private static final String PATH =
- System.getProperty("user.dir") + StrUtil.SLASH + LucenePath.PORTFOLIO_PATH;
+ /**
+ * lucene索引保存目录
+ */
+ private static final String PATH =
+ System.getProperty("user.dir") + StrUtil.SLASH + LucenePath.PORTFOLIO_PATH;
- /** 删除所有运行中保存的索引 */
- public static void deleteAllIndex() {
- if (FileUtil.exist(LucenePath.PORTFOLIO_INCREMENT_INDEX_PATH)) {
- FileUtil.del(LucenePath.PORTFOLIO_INCREMENT_INDEX_PATH);
+ /**
+ * 删除所有运行中保存的索引
+ */
+ public static void deleteAllIndex() {
+ if (FileUtil.exist(LucenePath.PORTFOLIO_INCREMENT_INDEX_PATH)) {
+ FileUtil.del(LucenePath.PORTFOLIO_INCREMENT_INDEX_PATH);
+ }
}
- }
- public static void addIndex(PortfolioLucene t) {
- creatIndex(t);
- }
-
- public static void updateIndex(PortfolioLucene t) {
- deleteIndex(t.getIdPortfolio());
- creatIndex(t);
- }
-
- /**
- * 增加或创建单个索引
- *
- * @param t
- * @throws Exception
- */
- private static synchronized void creatIndex(PortfolioLucene t) {
- System.out.println("创建单个索引");
- IndexWriter writer;
- try {
- writer = IndexUtil.getIndexWriter(LucenePath.PORTFOLIO_INCREMENT_INDEX_PATH, false);
- Document doc = new Document();
- doc.add(new StringField("id", t.getIdPortfolio() + "", Field.Store.YES));
- doc.add(new TextField("title", t.getPortfolioTitle(), Field.Store.YES));
- doc.add(new TextField("summary", t.getPortfolioDescription(), Field.Store.YES));
- writer.addDocument(doc);
- writer.close();
- } catch (IOException e) {
- e.printStackTrace();
+ public static void addIndex(PortfolioLucene t) {
+ creatIndex(t);
}
- }
- /** 删除单个索引 */
- public static synchronized void deleteIndex(Long id) {
- Arrays.stream(FileUtil.ls(PATH))
- .forEach(
- each -> {
- if (each.isDirectory()) {
- IndexWriter writer;
- try {
- writer = IndexUtil.getIndexWriter(each.getAbsolutePath(), false);
- writer.deleteDocuments(new Term("id", String.valueOf(id)));
- writer.forceMergeDeletes(); // 强制删除
- writer.commit();
- writer.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- });
- }
+ public static void updateIndex(PortfolioLucene t) {
+ deleteIndex(t.getIdPortfolio());
+ creatIndex(t);
+ }
+
+ /**
+ * 增加或创建单个索引
+ *
+ * @param t
+ * @throws Exception
+ */
+ private static synchronized void creatIndex(PortfolioLucene t) {
+ System.out.println("创建单个索引");
+ IndexWriter writer;
+ try {
+ writer = IndexUtil.getIndexWriter(LucenePath.PORTFOLIO_INCREMENT_INDEX_PATH, false);
+ Document doc = new Document();
+ doc.add(new StringField("id", t.getIdPortfolio() + "", Field.Store.YES));
+ doc.add(new TextField("title", t.getPortfolioTitle(), Field.Store.YES));
+ doc.add(new TextField("summary", t.getPortfolioDescription(), Field.Store.YES));
+ writer.addDocument(doc);
+ writer.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * 删除单个索引
+ */
+ public static synchronized void deleteIndex(Long id) {
+ Arrays.stream(FileUtil.ls(PATH))
+ .forEach(
+ each -> {
+ if (each.isDirectory()) {
+ IndexWriter writer;
+ try {
+ writer = IndexUtil.getIndexWriter(each.getAbsolutePath(), false);
+ writer.deleteDocuments(new Term("id", String.valueOf(id)));
+ writer.forceMergeDeletes(); // 强制删除
+ writer.commit();
+ writer.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ });
+ }
}
diff --git a/src/main/java/com/rymcu/forest/lucene/util/SearchUtil.java b/src/main/java/com/rymcu/forest/lucene/util/SearchUtil.java
index 3f84d08..68250a7 100644
--- a/src/main/java/com/rymcu/forest/lucene/util/SearchUtil.java
+++ b/src/main/java/com/rymcu/forest/lucene/util/SearchUtil.java
@@ -23,135 +23,139 @@ import java.util.concurrent.ExecutorService;
* @date 2021/2/2 14:04
*/
public class SearchUtil {
- /**
- * 获取IndexSearcher对象
- *
- * @param service
- * @return
- * @throws IOException
- */
- public static IndexSearcher getIndexSearcherByParentPath(
- String parentPath, ExecutorService service) {
- MultiReader reader = null;
- // 设置
- try {
- File[] files = new File(parentPath).listFiles();
- IndexReader[] readers = new IndexReader[files.length];
- for (int i = 0; i < files.length; i++) {
- readers[i] =
- DirectoryReader.open(FSDirectory.open(Paths.get(files[i].getPath())));
- }
- reader = new MultiReader(readers);
- } catch (IOException e) {
- e.printStackTrace();
+ /**
+ * 获取IndexSearcher对象
+ *
+ * @param service
+ * @return
+ * @throws IOException
+ */
+ public static IndexSearcher getIndexSearcherByParentPath(
+ String parentPath, ExecutorService service) {
+ MultiReader reader = null;
+ // 设置
+ try {
+ File[] files = new File(parentPath).listFiles();
+ IndexReader[] readers = new IndexReader[files.length];
+ for (int i = 0; i < files.length; i++) {
+ readers[i] =
+ DirectoryReader.open(FSDirectory.open(Paths.get(files[i].getPath())));
+ }
+ reader = new MultiReader(readers);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ return new IndexSearcher(reader, service);
}
- return new IndexSearcher(reader, service);
- }
- /**
- * 根据索引路径获取IndexReader
- *
- * @param indexPath
- * @return
- * @throws IOException
- */
- public static DirectoryReader getIndexReader(String indexPath) throws IOException {
- return DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
- }
- /**
- * 根据索引路径获取IndexSearcher
- *
- * @param indexPath
- * @param service
- * @return
- * @throws IOException
- */
- public static IndexSearcher getIndexSearcherByIndexPath(String indexPath, ExecutorService service)
- throws IOException {
- IndexReader reader = getIndexReader(indexPath);
- return new IndexSearcher(reader, service);
- }
- /**
- * 如果索引目录会有变更用此方法获取新的IndexSearcher这种方式会占用较少的资源
- *
- * @param oldSearcher
- * @param service
- * @return
- * @throws IOException
- */
- public static IndexSearcher getIndexSearcherOpenIfChanged(
- IndexSearcher oldSearcher, ExecutorService service) throws IOException {
- DirectoryReader reader = (DirectoryReader) oldSearcher.getIndexReader();
- DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
- return new IndexSearcher(newReader, service);
- }
-
- /**
- * 根据IndexSearcher和docID获取默认的document
- *
- * @param searcher
- * @param docID
- * @return
- * @throws IOException
- */
- public static Document getDefaultFullDocument(IndexSearcher searcher, int docID)
- throws IOException {
- return searcher.doc(docID);
- }
- /**
- * 根据IndexSearcher和docID
- *
- * @param searcher
- * @param docID
- * @param listField
- * @return
- * @throws IOException
- */
- public static Document getDocumentByListField(
- IndexSearcher searcher, int docID, Set listField) throws IOException {
- return searcher.doc(docID, listField);
- }
-
- /**
- * 分页查询
- *
- * @param page 当前页数
- * @param perPage 每页显示条数
- * @param searcher searcher查询器
- * @param query 查询条件
- * @return
- * @throws IOException
- */
- public static TopDocs getScoreDocsByPerPage(
- int page, int perPage, IndexSearcher searcher, Query query) throws IOException {
- TopDocs result = null;
- if (query == null) {
- System.out.println(" Query is null return null ");
- return null;
+ /**
+ * 根据索引路径获取IndexReader
+ *
+ * @param indexPath
+ * @return
+ * @throws IOException
+ */
+ public static DirectoryReader getIndexReader(String indexPath) throws IOException {
+ return DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
}
- ScoreDoc before = null;
- if (page != 1) {
- TopDocs docsBefore = searcher.search(query, (page - 1) * perPage);
- ScoreDoc[] scoreDocs = docsBefore.scoreDocs;
- if (scoreDocs.length > 0) {
- before = scoreDocs[scoreDocs.length - 1];
- }
- }
- result = searcher.searchAfter(before, query, perPage);
- return result;
- }
- public static TopDocs getScoreDocs(IndexSearcher searcher, Query query) throws IOException {
- TopDocs docs = searcher.search(query, getMaxDocId(searcher));
- return docs;
- }
- /**
- * 统计document的数量,此方法等同于matchAllDocsQuery查询
- *
- * @param searcher
- * @return
- */
- public static int getMaxDocId(IndexSearcher searcher) {
- return searcher.getIndexReader().maxDoc();
- }
+ /**
+ * 根据索引路径获取IndexSearcher
+ *
+ * @param indexPath
+ * @param service
+ * @return
+ * @throws IOException
+ */
+ public static IndexSearcher getIndexSearcherByIndexPath(String indexPath, ExecutorService service)
+ throws IOException {
+ IndexReader reader = getIndexReader(indexPath);
+ return new IndexSearcher(reader, service);
+ }
+
+ /**
+ * 如果索引目录会有变更用此方法获取新的IndexSearcher这种方式会占用较少的资源
+ *
+ * @param oldSearcher
+ * @param service
+ * @return
+ * @throws IOException
+ */
+ public static IndexSearcher getIndexSearcherOpenIfChanged(
+ IndexSearcher oldSearcher, ExecutorService service) throws IOException {
+ DirectoryReader reader = (DirectoryReader) oldSearcher.getIndexReader();
+ DirectoryReader newReader = DirectoryReader.openIfChanged(reader);
+ return new IndexSearcher(newReader, service);
+ }
+
+ /**
+ * 根据IndexSearcher和docID获取默认的document
+ *
+ * @param searcher
+ * @param docID
+ * @return
+ * @throws IOException
+ */
+ public static Document getDefaultFullDocument(IndexSearcher searcher, int docID)
+ throws IOException {
+ return searcher.doc(docID);
+ }
+
+ /**
+ * 根据IndexSearcher和docID
+ *
+ * @param searcher
+ * @param docID
+ * @param listField
+ * @return
+ * @throws IOException
+ */
+ public static Document getDocumentByListField(
+ IndexSearcher searcher, int docID, Set listField) throws IOException {
+ return searcher.doc(docID, listField);
+ }
+
+ /**
+ * 分页查询
+ *
+ * @param page 当前页数
+ * @param perPage 每页显示条数
+ * @param searcher searcher查询器
+ * @param query 查询条件
+ * @return
+ * @throws IOException
+ */
+ public static TopDocs getScoreDocsByPerPage(
+ int page, int perPage, IndexSearcher searcher, Query query) throws IOException {
+ TopDocs result = null;
+ if (query == null) {
+ System.out.println(" Query is null return null ");
+ return null;
+ }
+ ScoreDoc before = null;
+ if (page != 1) {
+ TopDocs docsBefore = searcher.search(query, (page - 1) * perPage);
+ ScoreDoc[] scoreDocs = docsBefore.scoreDocs;
+ if (scoreDocs.length > 0) {
+ before = scoreDocs[scoreDocs.length - 1];
+ }
+ }
+ result = searcher.searchAfter(before, query, perPage);
+ return result;
+ }
+
+ public static TopDocs getScoreDocs(IndexSearcher searcher, Query query) throws IOException {
+ TopDocs docs = searcher.search(query, getMaxDocId(searcher));
+ return docs;
+ }
+
+ /**
+ * 统计document的数量,此方法等同于matchAllDocsQuery查询
+ *
+ * @param searcher
+ * @return
+ */
+ public static int getMaxDocId(IndexSearcher searcher) {
+ return searcher.getIndexReader().maxDoc();
+ }
}
diff --git a/src/main/java/com/rymcu/forest/mapper/ArticleMapper.java b/src/main/java/com/rymcu/forest/mapper/ArticleMapper.java
index 53aa3e5..6ec1905 100644
--- a/src/main/java/com/rymcu/forest/mapper/ArticleMapper.java
+++ b/src/main/java/com/rymcu/forest/mapper/ArticleMapper.java
@@ -17,6 +17,7 @@ public interface ArticleMapper extends Mapper {
/**
* 获取文章列表
+ *
* @param searchText
* @param tag
* @param topicUri
@@ -26,6 +27,7 @@ public interface ArticleMapper extends Mapper {
/**
* 根据文章 ID 查询文章
+ *
* @param id
* @param type
* @return
@@ -34,6 +36,7 @@ public interface ArticleMapper extends Mapper {
/**
* 保存文章内容
+ *
* @param idArticle
* @param articleContent
* @param articleContentHtml
@@ -43,6 +46,7 @@ public interface ArticleMapper extends Mapper {
/**
* 更新文章内容
+ *
* @param idArticle
* @param articleContent
* @param articleContentHtml
@@ -52,6 +56,7 @@ public interface ArticleMapper extends Mapper {
/**
* 获取文章正文内容
+ *
* @param idArticle
* @return
*/
@@ -59,6 +64,7 @@ public interface ArticleMapper extends Mapper {
/**
* 获取主题下文章列表
+ *
* @param topicName
* @return
*/
@@ -66,6 +72,7 @@ public interface ArticleMapper extends Mapper {
/**
* 获取标签下文章列表
+ *
* @param tagName
* @return
*/
@@ -73,6 +80,7 @@ public interface ArticleMapper extends Mapper {
/**
* 获取用户文章列表
+ *
* @param idUser
* @return
*/
@@ -80,6 +88,7 @@ public interface ArticleMapper extends Mapper {
/**
* 删除文章标签
+ *
* @param id
* @return
*/
@@ -87,6 +96,7 @@ public interface ArticleMapper extends Mapper {
/**
* 获取文章标签列表
+ *
* @param idArticle
* @return
*/
@@ -94,6 +104,7 @@ public interface ArticleMapper extends Mapper {
/**
* 更新文章浏览数
+ *
* @param id
* @param articleViewCount
* @return
@@ -102,6 +113,7 @@ public interface ArticleMapper extends Mapper {
/**
* 获取草稿列表
+ *
* @param idUser
* @return
*/
@@ -109,6 +121,7 @@ public interface ArticleMapper extends Mapper {
/**
* 删除未使用的文章标签
+ *
* @param idArticleTag
* @return
*/
@@ -116,6 +129,7 @@ public interface ArticleMapper extends Mapper {
/**
* 查询作品集下文章
+ *
* @param idPortfolio
* @return
*/
@@ -123,6 +137,7 @@ public interface ArticleMapper extends Mapper {
/**
* 查询作品集未绑定文章
+ *
* @param idPortfolio
* @param searchText
* @param idUser
@@ -132,6 +147,7 @@ public interface ArticleMapper extends Mapper {
/**
* 查询文章所属作品集列表
+ *
* @param idArticle
* @return
*/
@@ -139,6 +155,7 @@ public interface ArticleMapper extends Mapper {
/**
* 更新文章标签
+ *
* @param idArticle
* @param tags
* @return
@@ -147,6 +164,7 @@ public interface ArticleMapper extends Mapper {
/**
* 判断是否有评论
+ *
* @param id
* @return
*/
@@ -154,6 +172,7 @@ public interface ArticleMapper extends Mapper {
/**
* 删除关联作品集数据
+ *
* @param id
* @return
*/
@@ -161,6 +180,7 @@ public interface ArticleMapper extends Mapper {
/**
* 更新文章连接及预览内容
+ *
* @param idArticle
* @param articleLink
* @param articlePermalink
@@ -171,6 +191,7 @@ public interface ArticleMapper extends Mapper {
/**
* 根据专题主键及当前文章排序号获取专题下文章大纲
+ *
* @param idPortfolio
* @param sortNo
* @return
@@ -179,6 +200,7 @@ public interface ArticleMapper extends Mapper {
/**
* 更新文章优选状态
+ *
* @param idArticle
* @param articlePerfect
* @return
@@ -187,12 +209,14 @@ public interface ArticleMapper extends Mapper {
/**
* 删除文章关联文章内容表信息
+ *
* @param idArticle
*/
void deleteArticleContent(@Param("idArticle") Long idArticle);
/**
* 获取公告
+ *
* @return
*/
List selectAnnouncements();
diff --git a/src/main/java/com/rymcu/forest/mapper/ArticleThumbsUpMapper.java b/src/main/java/com/rymcu/forest/mapper/ArticleThumbsUpMapper.java
index 7d6f78d..a377552 100644
--- a/src/main/java/com/rymcu/forest/mapper/ArticleThumbsUpMapper.java
+++ b/src/main/java/com/rymcu/forest/mapper/ArticleThumbsUpMapper.java
@@ -10,6 +10,7 @@ import org.apache.ibatis.annotations.Param;
public interface ArticleThumbsUpMapper extends Mapper {
/**
* 更新文章点赞数
+ *
* @param idArticle
* @param thumbsUpNumber
* @return
diff --git a/src/main/java/com/rymcu/forest/mapper/BankAccountMapper.java b/src/main/java/com/rymcu/forest/mapper/BankAccountMapper.java
index af451f7..1568cc1 100644
--- a/src/main/java/com/rymcu/forest/mapper/BankAccountMapper.java
+++ b/src/main/java/com/rymcu/forest/mapper/BankAccountMapper.java
@@ -13,6 +13,7 @@ import java.util.List;
public interface BankAccountMapper extends Mapper {
/**
* 查询银行账户
+ *
* @param bankName
* @param accountOwnerName
* @param bankAccount
@@ -22,6 +23,7 @@ public interface BankAccountMapper extends Mapper {
/**
* 获取银行账户信息
+ *
* @param idBank
* @return
*/
@@ -29,12 +31,14 @@ public interface BankAccountMapper extends Mapper {
/**
* 获取当前最大卡号
+ *
* @return
*/
String selectMaxBankAccount();
/**
* 根据卡号获取银行账号信息
+ *
* @param bankAccount
* @return
*/
@@ -42,6 +46,7 @@ public interface BankAccountMapper extends Mapper {
/**
* 查询用户个人银行账户信息
+ *
* @param idUser
* @return
*/
diff --git a/src/main/java/com/rymcu/forest/mapper/BankMapper.java b/src/main/java/com/rymcu/forest/mapper/BankMapper.java
index d330d8c..c95bfc6 100644
--- a/src/main/java/com/rymcu/forest/mapper/BankMapper.java
+++ b/src/main/java/com/rymcu/forest/mapper/BankMapper.java
@@ -12,6 +12,7 @@ import java.util.List;
public interface BankMapper extends Mapper {
/**
* 查询银行列表数据
+ *
* @return
*/
List selectBanks();
diff --git a/src/main/java/com/rymcu/forest/mapper/CommentMapper.java b/src/main/java/com/rymcu/forest/mapper/CommentMapper.java
index 8f4b76e..7d97d6c 100644
--- a/src/main/java/com/rymcu/forest/mapper/CommentMapper.java
+++ b/src/main/java/com/rymcu/forest/mapper/CommentMapper.java
@@ -14,6 +14,7 @@ import java.util.List;
public interface CommentMapper extends Mapper {
/**
* 获取文章评论列表
+ *
* @param idArticle
* @return
*/
@@ -21,6 +22,7 @@ public interface CommentMapper extends Mapper {
/**
* 查询评论作者
+ *
* @param commentAuthorId
* @return
*/
@@ -28,6 +30,7 @@ public interface CommentMapper extends Mapper {
/**
* 查询父评论作者
+ *
* @param commentOriginalCommentId
* @return
*/
@@ -35,6 +38,7 @@ public interface CommentMapper extends Mapper {
/**
* 更新文章评论分享链接
+ *
* @param idComment
* @param commentSharpUrl
* @return
@@ -43,6 +47,7 @@ public interface CommentMapper extends Mapper {
/**
* 获取评论列表数据
+ *
* @return
*/
List selectComments();
diff --git a/src/main/java/com/rymcu/forest/mapper/DashboardMapper.java b/src/main/java/com/rymcu/forest/mapper/DashboardMapper.java
index 73f1cbf..b0db7d0 100644
--- a/src/main/java/com/rymcu/forest/mapper/DashboardMapper.java
+++ b/src/main/java/com/rymcu/forest/mapper/DashboardMapper.java
@@ -13,102 +13,119 @@ import java.util.List;
public interface DashboardMapper {
/**
* 获取总用户数
+ *
* @return
- * */
+ */
Integer selectUserCount();
/**
* 获取新注册用户数
+ *
* @return
- * */
+ */
Integer selectNewUserCount();
/**
* 获取文章总数
+ *
* @return
*/
Integer selectArticleCount();
/**
* 获取今日发布文章总数
+ *
* @return
*/
Integer selectNewArticleCount();
/**
* 获取浏览量总数
+ *
* @return
*/
Integer selectCountViewNum();
/**
* 获取今日浏览总数
+ *
* @return
*/
Integer selectTodayViewNum();
/**
* 获取最近 30 天文章数据
+ *
* @return
*/
List selectLastThirtyDaysArticleData();
/**
* 获取最近 30 天用户数据
+ *
* @return
*/
List selectLastThirtyDaysUserData();
/**
* 获取最近 30 天流量数据
+ *
* @return
*/
List selectLastThirtyDaysVisitData();
/**
* 获取历史 1 年文章数据
+ *
* @return
*/
List selectHistoryArticleData();
/**
* 获取历史 1 年用户数据
+ *
* @return
*/
List selectHistoryUserData();
/**
* 获取历史 1 年访问数据
+ *
* @return
*/
List selectHistoryVisitData();
/**
* 获取新增用户列表
+ *
* @return
*/
List selectNewUsers();
/**
* 获取新增银行账号列表
+ *
* @return
*/
List selectNewBankAccounts();
/**
* 获取新增文章列表
+ *
* @return
*/
List selectNewArticles();
/**
* 获取最近 30 天访客数据
+ *
* @return
*/
List | |